summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCyndy Ishida <cyndy_ishida@apple.com>2024-03-21 22:47:33 +0000
committerPaul Kirth <paulkirth@google.com>2024-03-21 22:47:33 +0000
commitf324ac802aac76c2423d1bd20a5826bfcd17f5fe (patch)
tree64f1c1973a9b6dacd990397b594d5e9ffc9e121c
parent48a148ef47662e96d8167944e9653c62a4ed5339 (diff)
parentdc74bf7a5412df82223f7062d9a6b814abbfca45 (diff)
Created using spr 1.3.4 [skip ci]
-rw-r--r--.github/CODEOWNERS19
-rw-r--r--bolt/CMakeLists.txt4
-rw-r--r--bolt/docs/BAT.md7
-rw-r--r--bolt/include/bolt/Core/MCPlus.h1
-rw-r--r--bolt/include/bolt/Core/MCPlusBuilder.h17
-rw-r--r--bolt/include/bolt/Profile/BoltAddressTranslation.h13
-rw-r--r--bolt/include/bolt/Profile/DataAggregator.h4
-rw-r--r--bolt/include/bolt/Profile/YAMLProfileWriter.h4
-rw-r--r--bolt/lib/Core/BinaryContext.cpp8
-rw-r--r--bolt/lib/Core/BinaryFunction.cpp17
-rw-r--r--bolt/lib/Core/MCPlusBuilder.cpp22
-rw-r--r--bolt/lib/Passes/BinaryPasses.cpp20
-rw-r--r--bolt/lib/Profile/BoltAddressTranslation.cpp87
-rw-r--r--bolt/lib/Profile/DataAggregator.cpp68
-rw-r--r--bolt/lib/Profile/YAMLProfileWriter.cpp35
-rw-r--r--bolt/lib/Rewrite/LinuxKernelRewriter.cpp385
-rw-r--r--bolt/lib/Rewrite/RewriteInstance.cpp25
-rw-r--r--bolt/lib/Target/X86/X86MCPlusBuilder.cpp13
-rw-r--r--bolt/lib/Utils/CommandLineOpts.cpp4
-rw-r--r--bolt/test/X86/Inputs/blarge_new.preagg.txt81
-rw-r--r--bolt/test/X86/Inputs/blarge_new.yaml1648
-rw-r--r--bolt/test/X86/Inputs/blarge_new_bat.preagg.txt79
-rw-r--r--bolt/test/X86/bolt-address-translation-yaml.test40
-rw-r--r--bolt/test/X86/bolt-address-translation.test11
-rw-r--r--bolt/test/X86/dwarf5-label-low-pc.s7
-rw-r--r--bolt/test/X86/linux-static-keys.s67
-rw-r--r--clang/CMakeLists.txt5
-rw-r--r--clang/docs/ClangFormat.rst1
-rw-r--r--clang/docs/LanguageExtensions.rst114
-rw-r--r--clang/docs/ReleaseNotes.rst25
-rw-r--r--clang/include/clang/Basic/Builtins.td12
-rw-r--r--clang/include/clang/Basic/DiagnosticGroups.td7
-rw-r--r--clang/include/clang/Basic/DiagnosticInstallAPIKinds.td1
-rw-r--r--clang/include/clang/Basic/DiagnosticParseKinds.td1
-rw-r--r--clang/include/clang/Basic/DiagnosticSemaKinds.td17
-rw-r--r--clang/include/clang/Basic/Features.def1
-rw-r--r--clang/include/clang/InstallAPI/DylibVerifier.h13
-rw-r--r--clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h16
-rw-r--r--clang/lib/CodeGen/CGBuiltin.cpp46
-rw-r--r--clang/lib/CodeGen/CGDecl.cpp43
-rw-r--r--clang/lib/CodeGen/CGStmt.cpp33
-rw-r--r--clang/lib/CodeGen/Targets/X86.cpp4
-rw-r--r--clang/lib/Driver/Driver.cpp49
-rw-r--r--clang/lib/Driver/ToolChains/Clang.cpp4
-rw-r--r--clang/lib/Headers/avxintrin.h42
-rw-r--r--clang/lib/Headers/emmintrin.h166
-rw-r--r--clang/lib/Headers/xmmintrin.h98
-rw-r--r--clang/lib/InstallAPI/DylibVerifier.cpp171
-rw-r--r--clang/lib/Parse/ParseExpr.cpp2
-rw-r--r--clang/lib/Parse/ParseExprCXX.cpp9
-rw-r--r--clang/lib/Parse/ParseInit.cpp2
-rw-r--r--clang/lib/Sema/SemaChecking.cpp47
-rw-r--r--clang/lib/Sema/SemaDecl.cpp2
-rw-r--r--clang/lib/Sema/SemaDeclCXX.cpp3
-rw-r--r--clang/lib/Sema/SemaType.cpp5
-rw-r--r--clang/lib/StaticAnalyzer/Core/MemRegion.cpp20
-rw-r--r--clang/lib/StaticAnalyzer/Core/ProgramState.cpp32
-rw-r--r--clang/test/Analysis/inlining/false-positive-suppression.cpp17
-rw-r--r--clang/test/CodeGen/aapcs-align.cpp4
-rw-r--r--clang/test/CodeGen/aapcs64-align.cpp8
-rw-r--r--clang/test/CodeGen/attr-counted-by.c26
-rw-r--r--clang/test/CodeGen/builtins.c204
-rw-r--r--clang/test/CodeGen/swift-async-call-conv.c16
-rw-r--r--clang/test/CodeGen/ubsan-builtin-checks.c6
-rw-r--r--clang/test/CodeGenCXX/auto-var-init.cpp27
-rw-r--r--clang/test/CodeGenCXX/x86_64-vaarg.cpp23
-rw-r--r--clang/test/CodeGenOpenCL/amdgpu-printf.cl9
-rw-r--r--clang/test/Driver/clang-offload-bundler-asserts-on.c2
-rw-r--r--clang/test/Driver/clang-offload-bundler-standardize.c2
-rw-r--r--clang/test/Driver/clang-offload-bundler.c2
-rw-r--r--clang/test/Driver/fat-archive-unbundle-ext.c2
-rw-r--r--clang/test/Driver/modules-print-library-module-manifest-path.cpp19
-rw-r--r--clang/test/Driver/unsupported-option-gpu.c1
-rw-r--r--clang/test/Format/fail-on-incomplete.cpp4
-rw-r--r--clang/test/InstallAPI/diagnostics-cpp.test2
-rw-r--r--clang/test/InstallAPI/linker-symbols.test440
-rw-r--r--clang/test/InstallAPI/mismatching-objc-class-symbols.test269
-rw-r--r--clang/test/InstallAPI/symbol-flags.test290
-rw-r--r--clang/test/Lexer/has_extension_cxx.cpp5
-rw-r--r--clang/test/OpenMP/bug54082.c4
-rw-r--r--clang/test/OpenMP/declare_reduction_messages.cpp14
-rw-r--r--clang/test/OpenMP/openmp_check.cpp2
-rw-r--r--clang/test/Parser/cxx03-lambda-extension.cpp5
-rw-r--r--clang/test/Parser/cxx0x-lambda-expressions.cpp116
-rw-r--r--clang/test/Parser/cxx2b-lambdas.cpp45
-rw-r--r--clang/test/Parser/objcxx-lambda-expressions-neg.mm9
-rw-r--r--clang/test/ParserHLSL/group_shared.hlsl4
-rw-r--r--clang/test/Sema/builtin-popcountg.c23
-rw-r--r--clang/test/Sema/count-builtins.c87
-rw-r--r--clang/test/Sema/warn-cast-function-type-strict.c12
-rw-r--r--clang/test/SemaCXX/cxx2a-template-lambdas.cpp26
-rw-r--r--clang/test/SemaCXX/lambda-expressions.cpp64
-rw-r--r--clang/test/SemaCXX/lambda-implicit-this-capture.cpp1
-rw-r--r--clang/test/SemaCXX/lambda-invalid-capture.cpp1
-rw-r--r--clang/test/SemaCXX/new-delete.cpp7
-rw-r--r--clang/test/SemaCXX/warn-cast-function-type-strict.cpp10
-rw-r--r--clang/test/SemaObjC/attr-objc-NSObject.m23
-rw-r--r--clang/tools/clang-format/ClangFormat.cpp13
-rw-r--r--clang/tools/clang-installapi/ClangInstallAPI.cpp2
-rw-r--r--clang/unittests/StaticAnalyzer/CMakeLists.txt1
-rw-r--r--clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp145
-rw-r--r--compiler-rt/cmake/Modules/CompilerRTCompile.cmake2
-rw-r--r--compiler-rt/lib/msan/tests/CMakeLists.txt6
-rw-r--r--compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.h16
-rw-r--r--compiler-rt/lib/tsan/tests/CMakeLists.txt2
-rw-r--r--compiler-rt/test/dfsan/custom.cpp2
-rw-r--r--flang/include/flang/Common/Version.h6
-rw-r--r--flang/include/flang/Frontend/CodeGenOptions.h6
-rw-r--r--flang/include/flang/Frontend/LangOptions.h6
-rw-r--r--flang/lib/Lower/ConvertConstant.cpp22
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.cpp59
-rw-r--r--flang/lib/Lower/OpenMP/ClauseProcessor.h31
-rw-r--r--flang/lib/Lower/OpenMP/OpenMP.cpp125
-rw-r--r--flang/lib/Lower/OpenMP/Utils.cpp19
-rw-r--r--flang/lib/Lower/OpenMP/Utils.h3
-rw-r--r--flang/runtime/CMakeLists.txt2
-rw-r--r--flang/runtime/external-unit.cpp333
-rw-r--r--flang/runtime/io-stmt.cpp10
-rw-r--r--flang/runtime/lock.h17
-rw-r--r--flang/runtime/pseudo-unit.cpp167
-rw-r--r--flang/runtime/tools.h21
-rw-r--r--flang/runtime/unit.cpp319
-rw-r--r--flang/runtime/unit.h61
-rw-r--r--flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f9048
-rw-r--r--flang/test/Lower/OpenMP/wsloop-reduction-multi.f9081
-rw-r--r--libc/CMakeLists.txt3
-rw-r--r--libc/config/linux/aarch64/entrypoints.txt1
-rw-r--r--libc/config/linux/riscv/entrypoints.txt1
-rw-r--r--libc/config/linux/x86_64/entrypoints.txt1
-rw-r--r--libc/docs/stdio.rst2
-rw-r--r--libc/include/llvm-libc-macros/math-macros.h32
-rw-r--r--libc/spec/stdc.td5
-rw-r--r--libc/src/__support/CPP/CMakeLists.txt5
-rw-r--r--libc/src/__support/CPP/type_traits.h1
-rw-r--r--libc/src/__support/CPP/type_traits/is_constant_evaluated.h21
-rw-r--r--libc/src/stdio/CMakeLists.txt7
-rw-r--r--libc/src/stdio/linux/CMakeLists.txt12
-rw-r--r--libc/src/stdio/linux/rename.cpp28
-rw-r--r--libc/src/stdio/rename.h18
-rw-r--r--libc/test/src/stdio/CMakeLists.txt15
-rw-r--r--libc/test/src/stdio/rename_test.cpp51
-rw-r--r--libcxx/docs/ReleaseNotes/19.rst3
-rw-r--r--libcxx/docs/Status/Cxx23Issues.csv2
-rw-r--r--libcxx/docs/UsingLibcxx.rst12
-rw-r--r--libcxx/include/__config2
-rw-r--r--libcxx/include/__memory/allocator.h16
-rw-r--r--libcxx/include/__string/char_traits.h11
-rw-r--r--libcxx/include/__system_error/errc.h70
-rw-r--r--libcxx/include/cerrno13
-rw-r--r--libcxx/modules/CMakeLists.txt15
-rw-r--r--libcxx/src/random.cpp4
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.pass.cpp (renamed from libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx2a.pass.cpp)33
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.verify.cpp42
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.depr_in_cxx17.verify.cpp4
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.pass.cpp (renamed from libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.pass.cpp)31
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.verify.cpp23
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.verify.cpp28
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.depr_in_cxx17.verify.cpp4
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.pass.cpp (renamed from libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx2a.pass.cpp)53
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.verify.cpp77
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.pass.cpp (renamed from libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx2a.pass.cpp)15
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.verify.cpp32
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx20.pass.cpp (renamed from libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp)24
-rw-r--r--libcxx/test/libcxx/depr/depr.default.allocator/enable_removed_allocator_members.deprecated.verify.cpp20
-rw-r--r--libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_allocator_void_no_members.verify.cpp25
-rw-r--r--libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_with_removed_members.compile.pass.cpp22
-rw-r--r--libcxx/test/std/containers/sequences/deque/types.pass.cpp25
-rw-r--r--libcxx/test/std/containers/sequences/list/types.pass.cpp13
-rw-r--r--libcxx/test/std/containers/sequences/vector/types.pass.cpp25
-rw-r--r--libcxx/test/std/depr.cerro/cerrno.syn.verify.cpp37
-rw-r--r--libcxx/test/std/depr.cerro/system.error.syn.verify.cpp28
-rw-r--r--libcxx/test/std/diagnostics/syserr/errc.pass.cpp2
-rw-r--r--lld/MachO/Driver.cpp42
-rw-r--r--lld/MachO/InputSection.cpp38
-rw-r--r--lld/MachO/InputSection.h3
-rw-r--r--lld/MachO/ObjC.cpp16
-rw-r--r--lld/MachO/SyntheticSections.cpp4
-rw-r--r--lld/MinGW/Driver.cpp2
-rw-r--r--lld/MinGW/Options.td2
-rw-r--r--lld/test/MinGW/driver.test3
-rw-r--r--lldb/include/lldb/Core/Disassembler.h2
-rw-r--r--lldb/include/lldb/Core/Progress.h6
-rw-r--r--lldb/include/lldb/Symbol/LineEntry.h5
-rw-r--r--lldb/include/lldb/Utility/SupportFile.h3
-rw-r--r--lldb/packages/Python/lldbsuite/test/make/Makefile.rules33
-rw-r--r--lldb/source/API/SBLineEntry.cpp10
-rw-r--r--lldb/source/API/SBThread.cpp2
-rw-r--r--lldb/source/Breakpoint/BreakpointResolver.cpp2
-rw-r--r--lldb/source/Breakpoint/BreakpointResolverFileLine.cpp7
-rw-r--r--lldb/source/Commands/CommandObjectBreakpoint.cpp4
-rw-r--r--lldb/source/Commands/CommandObjectSource.cpp14
-rw-r--r--lldb/source/Commands/CommandObjectThread.cpp2
-rw-r--r--lldb/source/Core/Address.cpp2
-rw-r--r--lldb/source/Core/Disassembler.cpp8
-rw-r--r--lldb/source/Core/FormatEntity.cpp2
-rw-r--r--lldb/source/Core/IOHandlerCursesGUI.cpp11
-rw-r--r--lldb/source/Core/SourceManager.cpp2
-rw-r--r--lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp2
-rw-r--r--lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp4
-rw-r--r--lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp38
-rw-r--r--lldb/source/Plugins/SymbolLocator/CMakeLists.txt7
-rw-r--r--lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp30
-rw-r--r--lldb/source/Symbol/CompileUnit.cpp2
-rw-r--r--lldb/source/Symbol/Function.cpp4
-rw-r--r--lldb/source/Symbol/LineEntry.cpp17
-rw-r--r--lldb/source/Symbol/LineTable.cpp4
-rw-r--r--lldb/source/Symbol/SymbolContext.cpp4
-rw-r--r--lldb/source/Target/StackFrame.cpp2
-rw-r--r--lldb/source/Target/StackFrameList.cpp4
-rw-r--r--lldb/source/Target/Thread.cpp8
-rw-r--r--lldb/source/Target/TraceDumper.cpp4
-rw-r--r--lldb/test/API/debuginfod/Normal/Makefile25
-rw-r--r--lldb/test/API/debuginfod/Normal/TestDebuginfod.py185
-rw-r--r--lldb/test/API/debuginfod/Normal/main.c7
-rw-r--r--lldb/test/API/debuginfod/SplitDWARF/Makefile28
-rw-r--r--lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py194
-rw-r--r--lldb/test/API/debuginfod/SplitDWARF/main.c7
-rw-r--r--lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp2
-rw-r--r--llvm/docs/AMDGPUUsage.rst5
-rw-r--r--llvm/docs/CommandGuide/llvm-objcopy.rst13
-rw-r--r--llvm/docs/InstCombineContributorGuide.md556
-rw-r--r--llvm/docs/ReleaseNotes.rst4
-rw-r--r--llvm/docs/UserGuides.rst5
-rw-r--r--llvm/include/llvm/BinaryFormat/COFF.h6
-rw-r--r--llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h1
-rw-r--r--llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h3
-rw-r--r--llvm/include/llvm/IR/IntrinsicsAMDGPU.td86
-rw-r--r--llvm/include/llvm/IR/PatternMatch.h20
-rw-r--r--llvm/include/llvm/ObjCopy/CommonConfig.h1
-rw-r--r--llvm/include/llvm/Passes/TargetPassRegistry.inc194
-rw-r--r--llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h1
-rw-r--r--llvm/lib/CodeGen/FinalizeISel.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp10
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp15
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp74
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp5
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp10
-rw-r--r--llvm/lib/IR/ConstantRange.cpp4
-rw-r--r--llvm/lib/MC/MCDwarf.cpp7
-rw-r--r--llvm/lib/ObjCopy/ConfigManager.cpp8
-rw-r--r--llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp3
-rw-r--r--llvm/lib/Object/COFFImportFile.cpp7
-rw-r--r--llvm/lib/ProfileData/RawMemProfReader.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp57
-rw-r--r--llvm/lib/Target/AArch64/AArch64PassRegistry.def20
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp4
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp96
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp40
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td87
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp30
-rw-r--r--llvm/lib/Target/RISCV/RISCVFoldMasks.cpp48
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp11
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp14
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td2
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp8
-rw-r--r--llvm/lib/TextAPI/BinaryReader/DylibReader.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp6
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp8
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp5
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp23
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp20
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp22
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/Float2Int.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp10
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp13
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp37
-rw-r--r--llvm/runtimes/CMakeLists.txt4
-rw-r--r--llvm/test/Analysis/AliasSet/intrinsics.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir84
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-abs.mir28
-rw-r--r--llvm/test/CodeGen/AArch64/abs.ll26
-rw-r--r--llvm/test/CodeGen/AArch64/arm64-xaluo.ll3
-rw-r--r--llvm/test/CodeGen/AArch64/overflow.ll39
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll255
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll24
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoi.i128.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args-inreg.ll133
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll4355
-rw-r--r--llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-call.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll1
-rw-r--r--llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll15
-rw-r--r--llvm/test/CodeGen/ARM/select.ll399
-rw-r--r--llvm/test/CodeGen/RISCV/double-convert.ll28
-rw-r--r--llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll16
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll2
-rw-r--r--llvm/test/CodeGen/RISCV/spill-fill-fold.ll14
-rw-r--r--llvm/test/CodeGen/RISCV/zdinx-large-spill.mir74
-rw-r--r--llvm/test/CodeGen/SystemZ/frame-adjstack.ll16
-rw-r--r--llvm/test/CodeGen/X86/apx/domain-reassignment.mir929
-rw-r--r--llvm/test/CodeGen/X86/dagcombine-shifts.ll127
-rw-r--r--llvm/test/CodeGen/X86/oddshuffles.ll25
-rw-r--r--llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s278
-rw-r--r--llvm/test/MC/AMDGPU/vinterp-fake16.s182
-rw-r--r--llvm/test/MC/COFF/dwarf5lineinfo.s13
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vinterp.txt251
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt9
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt251
-rw-r--r--llvm/test/MC/Disassembler/AMDGPU/vinterp-fake16.txt252
-rw-r--r--llvm/test/Transforms/Attributor/align.ll163
-rw-r--r--llvm/test/Transforms/Attributor/nocapture-1.ll12
-rw-r--r--llvm/test/Transforms/Attributor/nofpclass.ll2
-rw-r--r--llvm/test/Transforms/Float2Int/pr79158.ll20
-rw-r--r--llvm/test/Transforms/InstCombine/add.ll76
-rw-r--r--llvm/test/Transforms/InstCombine/binop-itofp.ll12
-rw-r--r--llvm/test/Transforms/InstCombine/div.ll22
-rw-r--r--llvm/test/Transforms/InstCombine/fmul.ll99
-rw-r--r--llvm/test/Transforms/InstCombine/fpcast.ll5
-rw-r--r--llvm/test/Transforms/InstCombine/sadd-with-overflow.ll32
-rw-r--r--llvm/test/Transforms/InstCombine/shift-add.ll29
-rw-r--r--llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll17
-rw-r--r--llvm/test/Transforms/InstCombine/shuffle_select.ll17
-rw-r--r--llvm/test/Transforms/InstCombine/uadd-with-overflow.ll23
-rw-r--r--llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll31
-rw-r--r--llvm/test/Transforms/PGOProfile/memop_profile_funclet_wasm.ll48
-rw-r--r--llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll43
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gather-nodes-different-bb.ll24
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/store-abs-minbitwidth.ll70
-rw-r--r--llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll22
-rw-r--r--llvm/test/tools/llvm-lib/arm64ec-implib.test13
-rw-r--r--llvm/test/tools/llvm-objcopy/ELF/skip-symbol.test100
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOptions.cpp9
-rw-r--r--llvm/tools/llvm-objcopy/ObjcopyOpts.td14
-rw-r--r--llvm/unittests/IR/ConstantRangeTest.cpp18
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn1
-rw-r--r--mlir/docs/DataLayout.md4
-rw-r--r--mlir/docs/Tutorials/transform/Ch2.md4
-rw-r--r--mlir/examples/transform/Ch2/include/MyExtension.h2
-rw-r--r--mlir/examples/transform/Ch2/include/MyExtension.td2
-rw-r--r--mlir/examples/transform/Ch2/lib/MyExtension.cpp2
-rw-r--r--mlir/examples/transform/Ch3/include/MyExtension.h5
-rw-r--r--mlir/examples/transform/Ch3/include/MyExtension.td2
-rw-r--r--mlir/examples/transform/Ch3/include/MyExtensionTypes.td2
-rw-r--r--mlir/examples/transform/Ch3/lib/MyExtension.cpp1
-rw-r--r--mlir/examples/transform/Ch4/include/MyExtension.h2
-rw-r--r--mlir/examples/transform/Ch4/include/MyExtension.td2
-rw-r--r--mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h21
-rw-r--r--mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h20
-rw-r--r--mlir/include/mlir/Conversion/Passes.h1
-rw-r--r--mlir/include/mlir/Conversion/Passes.td9
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h4
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h12
-rw-r--r--mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td18
-rw-r--r--mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h2
-rw-r--r--mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td2
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h5
-rw-r--r--mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h5
-rw-r--r--mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/CMakeLists.txt1
-rw-r--r--mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt10
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td8
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/TransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/TransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/TransformTypes.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/Interfaces/CMakeLists.txt11
-rw-r--r--mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h (renamed from mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h)11
-rw-r--r--mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.td (renamed from mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td)8
-rw-r--r--mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h2
-rw-r--r--mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h2
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h104
-rw-r--r--mlir/include/mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h20
-rw-r--r--mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h2
-rw-r--r--mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td2
-rw-r--r--mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h58
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h7
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td61
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td4
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td305
-rw-r--r--mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td104
-rw-r--r--mlir/include/mlir/IR/TensorEncoding.td4
-rw-r--r--mlir/include/mlir/InitAllDialects.h2
-rw-r--r--mlir/include/mlir/Interfaces/DataLayoutInterfaces.h13
-rw-r--r--mlir/include/mlir/Interfaces/DataLayoutInterfaces.td28
-rw-r--r--mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h16
-rw-r--r--mlir/lib/CAPI/Dialect/TransformInterpreter.cpp2
-rw-r--r--mlir/lib/Conversion/CMakeLists.txt1
-rw-r--r--mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp23
-rw-r--r--mlir/lib/Conversion/MemRefToEmitC/CMakeLists.txt18
-rw-r--r--mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp114
-rw-r--r--mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp55
-rw-r--r--mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp52
-rw-r--r--mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/Affine/Utils/Utils.cpp3
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp5
-rw-r--r--mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp116
-rw-r--r--mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp3
-rw-r--r--mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/GPU/TransformOps/Utils.cpp2
-rw-r--r--mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp25
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp1
-rw-r--r--mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp17
-rw-r--r--mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp3
-rw-r--r--mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp9
-rw-r--r--mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp2
-rw-r--r--mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp73
-rw-r--r--mlir/lib/Dialect/Transform/CMakeLists.txt1
-rw-r--r--mlir/lib/Dialect/Transform/IR/CMakeLists.txt3
-rw-r--r--mlir/lib/Dialect/Transform/IR/TransformDialect.cpp5
-rw-r--r--mlir/lib/Dialect/Transform/IR/TransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/Transform/IR/TransformTypes.cpp4
-rw-r--r--mlir/lib/Dialect/Transform/Interfaces/CMakeLists.txt15
-rw-r--r--mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp (renamed from mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp)29
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp2
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp2
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp2
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp2
-rw-r--r--mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp2
-rw-r--r--mlir/lib/Dialect/Vector/IR/CMakeLists.txt2
-rw-r--r--mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp103
-rw-r--r--mlir/lib/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.cpp51
-rw-r--r--mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp2
-rw-r--r--mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp179
-rw-r--r--mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt2
-rw-r--r--mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp73
-rw-r--r--mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp187
-rw-r--r--mlir/lib/Interfaces/DataLayoutInterfaces.cpp30
-rw-r--r--mlir/lib/Interfaces/ValueBoundsOpInterface.cpp86
-rw-r--r--mlir/lib/Target/LLVMIR/ModuleTranslation.cpp11
-rw-r--r--mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir36
-rw-r--r--mlir/test/Conversion/GPUToSPIRV/load-store.mlir8
-rw-r--r--mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir40
-rw-r--r--mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir28
-rw-r--r--mlir/test/Conversion/MemRefToSPIRV/bitwidth-emulation.mlir158
-rw-r--r--mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir34
-rw-r--r--mlir/test/Conversion/SCFToSPIRV/for.mlir12
-rw-r--r--mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir10
-rw-r--r--mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir20
-rw-r--r--mlir/test/Dialect/Arith/one-shot-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir6
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-bottom-up-from-terminators.mlir36
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir6
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir6
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir11
-rw-r--r--mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/EmitC/transforms.mlir17
-rw-r--r--mlir/test/Dialect/LLVMIR/layout.mlir30
-rw-r--r--mlir/test/Dialect/Linalg/flatten-elementwise.mlir21
-rw-r--r--mlir/test/Dialect/Linalg/one-shot-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir6
-rw-r--r--mlir/test/Dialect/SCF/one-shot-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/Tensor/decompose-concat.mlir3
-rw-r--r--mlir/test/Dialect/Tensor/fold-empty-op.mlir6
-rw-r--r--mlir/test/Dialect/Tensor/fold-tensor-subset-ops-into-vector-transfers.mlir3
-rw-r--r--mlir/test/Dialect/Tensor/one-shot-bufferize.mlir6
-rw-r--r--mlir/test/Dialect/Tensor/rewrite-as-constant.mlir3
-rw-r--r--mlir/test/Dialect/Tosa/constant-op-fold.mlir17
-rw-r--r--mlir/test/Dialect/Vector/test-scalable-bounds.mlir161
-rw-r--r--mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir3
-rw-r--r--mlir/test/Dialect/Vector/vector-materialize-mask.mlir3
-rw-r--r--mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir3
-rw-r--r--mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir3
-rw-r--r--mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir3
-rw-r--r--mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir11
-rw-r--r--mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir15
-rw-r--r--mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir9
-rw-r--r--mlir/test/Dialect/Vector/vector-transpose-lowering.mlir24
-rw-r--r--mlir/test/Dialect/XeGPU/XeGPUOps.mlir62
-rw-r--r--mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir3
-rwxr-xr-xmlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir3
-rw-r--r--mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir3
-rw-r--r--mlir/test/Interfaces/DataLayoutInterfaces/module.mlir4
-rw-r--r--mlir/test/Interfaces/DataLayoutInterfaces/query.mlir51
-rw-r--r--mlir/test/Interfaces/DataLayoutInterfaces/types.mlir1
-rw-r--r--mlir/test/Target/LLVMIR/data-layout.mlir2
-rw-r--r--mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp38
-rw-r--r--mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp12
-rw-r--r--mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp2
-rw-r--r--mlir/test/lib/Dialect/Test/TestTypeDefs.td3
-rw-r--r--mlir/test/lib/Dialect/Test/TestTypes.cpp8
-rw-r--r--mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp2
-rw-r--r--mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.h2
-rw-r--r--mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td2
-rw-r--r--mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp2
-rw-r--r--mlir/test/lib/Dialect/Transform/TestTransformStateExtension.h2
-rw-r--r--mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp2
-rw-r--r--mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td2
-rw-r--r--mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp7
-rw-r--r--openmp/libomptarget/plugins-nextgen/CMakeLists.txt99
-rw-r--r--openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt17
-rw-r--r--openmp/libomptarget/plugins-nextgen/host/CMakeLists.txt109
-rw-r--r--openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp (renamed from openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.cpp)0
-rw-r--r--openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h (renamed from openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.h)0
-rw-r--r--openmp/libomptarget/plugins-nextgen/host/src/rtl.cpp (renamed from openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp)11
-rw-r--r--openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt17
-rw-r--r--openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt17
-rw-r--r--openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt17
-rw-r--r--openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt17
-rw-r--r--utils/bazel/llvm-project-overlay/libc/BUILD.bazel20
-rw-r--r--utils/bazel/llvm-project-overlay/llvm/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/BUILD.bazel98
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel1
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel38
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel7
-rw-r--r--utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel1
562 files changed, 16994 insertions, 6364 deletions
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index c246c42b0904..fea132c8fe78 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -59,8 +59,8 @@ clang/test/AST/Interp/ @tbaederr
/mlir/Dialect/*/Transforms/Bufferize.cpp @matthias-springer
# Linalg Dialect in MLIR.
-/mlir/include/mlir/Dialect/Linalg @dcaballe @nicolasvasilache
-/mlir/lib/Dialect/Linalg @dcaballe @nicolasvasilache
+/mlir/include/mlir/Dialect/Linalg/* @dcaballe @nicolasvasilache
+/mlir/lib/Dialect/Linalg/* @dcaballe @nicolasvasilache
/mlir/lib/Dialect/Linalg/Transforms/DecomposeLinalgOps.cpp @MaheshRavishankar @nicolasvasilache
/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @MaheshRavishankar @nicolasvasilache
/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @MaheshRavishankar @nicolasvasilache
@@ -77,14 +77,14 @@ clang/test/AST/Interp/ @tbaederr
/mlir/**/*SME* @banach-space @dcaballe @nicolasvasilache
/mlir/**/*SVE* @banach-space @dcaballe @nicolasvasilache
/mlir/**/*VectorInterfaces* @dcaballe @nicolasvasilache
-/mlir/**/*VectorToSCF* @banach-space @dcaballe @nicolasvasilache @matthias-springer
+/mlir/**/*VectorToSCF* @banach-space @dcaballe @matthias-springer @nicolasvasilache
/mlir/**/*VectorToLLVM* @banach-space @dcaballe @nicolasvasilache
/mlir/**/*X86Vector* @aartbik @dcaballe @nicolasvasilache
-/mlir/include/mlir/Dialect/Vector @dcaballe @nicolasvasilache
-/mlir/lib/Dialect/Vector @dcaballe @nicolasvasilache
-/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache
-/mlir/**/*EmulateNarrowType* @hanhanW
+/mlir/include/mlir/Dialect/Vector/* @dcaballe @nicolasvasilache
+/mlir/lib/Dialect/Vector/* @dcaballe @nicolasvasilache
/mlir/lib/Dialect/Vector/Transforms/* @hanhanW @nicolasvasilache
+/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @MaheshRavishankar @nicolasvasilache
+/mlir/**/*EmulateNarrowType* @dcaballe @hanhanW
# Presburger library in MLIR
/mlir/**/*Presburger* @Groverkss @Superty
@@ -119,3 +119,8 @@ clang/test/AST/Interp/ @tbaederr
# Bazel build system.
/utils/bazel/ @rupprecht
+
+# InstallAPI and TextAPI
+/llvm/**/TextAPI/ @cyndyishida
+/clang/**/InstallAPI/ @cyndyishida
+/clang/tools/clang-installapi/ @cyndyishida
diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index f163d4534287..cc3a70fa35e0 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -45,9 +45,9 @@ if (BOLT_ENABLE_RUNTIME)
execute_process(COMMAND ls /proc/self/map_files
RESULT_VARIABLE LS OUTPUT_QUIET ERROR_QUIET)
if (LS)
- set(BOLT_ENABLE_RUNTIME OFF)
message(WARNING
- "BOLT runtime is disabled as /proc/self/map_files is unreadable.")
+ "BOLT runtime may not be able to read /proc/self/map_files. Please use
+ `--instrumentation-binpath <path-to-instrumented-binary>` option.")
endif()
endif()
diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index d1cab984d148..186b0e5ea89d 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -14,9 +14,8 @@ binary onto the original binary.
# Usage
`--enable-bat` flag controls the generation of BAT section. Sampled profile
needs to be passed along with the optimized binary containing BAT section to
-`perf2bolt` which reads BAT section and produces fdata profile for the original
-binary. Note that YAML profile generation is not supported since BAT doesn't
-contain the metadata for input functions.
+`perf2bolt` which reads BAT section and produces profile for the original
+binary.
# Internals
## Section contents
@@ -79,6 +78,7 @@ Hot indices are delta encoded, implicitly starting at zero.
| ------ | ------| ----------- |
| `Address` | Continuous, Delta, ULEB128 | Function address in the output binary |
| `HotIndex` | Delta, ULEB128 | Cold functions only: index of corresponding hot function in hot functions table |
+| `FuncHash` | 8b | Hot functions only: function hash for input function |
| `NumEntries` | ULEB128 | Number of address translation entries for a function |
| `EqualElems` | ULEB128 | Hot functions only: number of equal offsets in the beginning of a function |
| `BranchEntries` | Bitmask, `alignTo(EqualElems, 8)` bits | Hot functions only: if `EqualElems` is non-zero, bitmask denoting entries with `BRANCHENTRY` bit |
@@ -94,6 +94,7 @@ entry is encoded. Input offsets implicitly start at zero.
| ------ | ------| ----------- |
| `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
| `InputOffset` | Optional, Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |
+| `BBHash` | Optional, 8b | Basic block entries only: basic block hash in input binary |
`BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
(branch or call instruction). If not set, it signifies a control flow target
diff --git a/bolt/include/bolt/Core/MCPlus.h b/bolt/include/bolt/Core/MCPlus.h
index b6a9e73f2347..1d2360c18033 100644
--- a/bolt/include/bolt/Core/MCPlus.h
+++ b/bolt/include/bolt/Core/MCPlus.h
@@ -73,6 +73,7 @@ public:
kOffset, /// Offset in the function.
kLabel, /// MCSymbol pointing to this instruction.
kSize, /// Size of the instruction.
+ kDynamicBranch, /// Jit instruction patched at runtime.
kGeneric /// First generic annotation.
};
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index 96b58f541623..198a8d8bf48f 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -1199,6 +1199,16 @@ public:
/// Set instruction size.
void setSize(MCInst &Inst, uint32_t Size) const;
+ /// Check if the branch instruction could be modified at runtime.
+ bool isDynamicBranch(const MCInst &Inst) const;
+
+ /// Return ID for runtime-modifiable instruction.
+ std::optional<uint32_t> getDynamicBranchID(const MCInst &Inst) const;
+
+ /// Mark instruction as a dynamic branch, i.e. a branch that can be
+ /// overwritten at runtime.
+ void setDynamicBranch(MCInst &Inst, uint32_t ID) const;
+
/// Return MCSymbol that represents a target of this instruction at a given
/// operand number \p OpNum. If there's no symbol associated with
/// the operand - return nullptr.
@@ -1688,6 +1698,13 @@ public:
llvm_unreachable("not implemented");
}
+ /// Create long conditional branch with a target-specific conditional code
+ /// \p CC.
+ virtual void createLongCondBranch(MCInst &Inst, const MCSymbol *Target,
+ unsigned CC, MCContext *Ctx) const {
+ llvm_unreachable("not implemented");
+ }
+
/// Reverses the branch condition in Inst and update its taken target to TBB.
///
/// Returns true on success.
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 844f0c54e68f..1f53f6d344ad 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -115,6 +115,16 @@ public:
/// Save function and basic block hashes used for metadata dump.
void saveMetadata(BinaryContext &BC);
+ /// Returns BB hash by function output address (after BOLT) and basic block
+ /// input offset.
+ size_t getBBHash(uint64_t FuncOutputAddress, uint32_t BBInputOffset) const;
+
+ /// Returns BF hash by function output address (after BOLT).
+ size_t getBFHash(uint64_t OutputAddress) const;
+
+ /// True if a given \p Address is a function with translation table entry.
+ bool isBATFunction(uint64_t Address) const { return Maps.count(Address); }
+
private:
/// Helper to update \p Map by inserting one or more BAT entries reflecting
/// \p BB for function located at \p FuncAddress. At least one entry will be
@@ -150,6 +160,9 @@ private:
/// Links outlined cold bocks to their original function
std::map<uint64_t, uint64_t> ColdPartSource;
+ /// Links output address of a main fragment back to input address.
+ std::unordered_map<uint64_t, uint64_t> ReverseMap;
+
/// Identifies the address of a control-flow changing instructions in a
/// translation map entry
const static uint32_t BRANCHENTRY = 0x1;
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 5bb4d00024c5..f7840b49199f 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -463,6 +463,10 @@ private:
/// Dump data structures into a file readable by llvm-bolt
std::error_code writeAggregatedFile(StringRef OutputFilename) const;
+ /// Dump translated data structures into YAML
+ std::error_code writeBATYAML(BinaryContext &BC,
+ StringRef OutputFilename) const;
+
/// Filter out binaries based on PID
void filterBinaryMMapInfo();
diff --git a/bolt/include/bolt/Profile/YAMLProfileWriter.h b/bolt/include/bolt/Profile/YAMLProfileWriter.h
index 2d3009ca9175..882748627e7f 100644
--- a/bolt/include/bolt/Profile/YAMLProfileWriter.h
+++ b/bolt/include/bolt/Profile/YAMLProfileWriter.h
@@ -9,6 +9,7 @@
#ifndef BOLT_PROFILE_YAML_PROFILE_WRITER_H
#define BOLT_PROFILE_YAML_PROFILE_WRITER_H
+#include "bolt/Profile/ProfileYAMLMapping.h"
#include "llvm/Support/raw_ostream.h"
#include <system_error>
@@ -29,6 +30,9 @@ public:
/// Save execution profile for that instance.
std::error_code writeProfile(const RewriteInstance &RI);
+
+ static yaml::bolt::BinaryFunctionProfile convert(const BinaryFunction &BF,
+ bool UseDFS);
};
} // namespace bolt
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index b29ebbbfa18c..267f43f65e20 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1939,7 +1939,13 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction,
OS << Endl;
return;
}
- InstPrinter->printInst(&Instruction, 0, "", *STI, OS);
+ if (std::optional<uint32_t> DynamicID =
+ MIB->getDynamicBranchID(Instruction)) {
+ OS << "\tjit\t" << MIB->getTargetSymbol(Instruction)->getName()
+ << " # ID: " << DynamicID;
+ } else {
+ InstPrinter->printInst(&Instruction, 0, "", *STI, OS);
+ }
if (MIB->isCall(Instruction)) {
if (MIB->isTailCall(Instruction))
OS << " # TAILCALL ";
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index ce4dd29f542b..fdadef9dcd38 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -3350,6 +3350,16 @@ void BinaryFunction::fixBranches() {
// Eliminate unnecessary conditional branch.
if (TSuccessor == FSuccessor) {
+ // FIXME: at the moment, we cannot safely remove static key branches.
+ if (MIB->isDynamicBranch(*CondBranch)) {
+ if (opts::Verbosity) {
+ BC.outs()
+ << "BOLT-INFO: unable to remove redundant dynamic branch in "
+ << *this << '\n';
+ }
+ continue;
+ }
+
BB->removeDuplicateConditionalSuccessor(CondBranch);
if (TSuccessor != NextBB)
BB->addBranchInstruction(TSuccessor);
@@ -3358,8 +3368,13 @@ void BinaryFunction::fixBranches() {
// Reverse branch condition and swap successors.
auto swapSuccessors = [&]() {
- if (MIB->isUnsupportedBranch(*CondBranch))
+ if (MIB->isUnsupportedBranch(*CondBranch)) {
+ if (opts::Verbosity) {
+ BC.outs() << "BOLT-INFO: unable to swap successors in " << *this
+ << '\n';
+ }
return false;
+ }
std::swap(TSuccessor, FSuccessor);
BB->swapConditionalSuccessors();
auto L = BC.scopeLock();
diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp
index bd9bd0c45922..5b14ad5cdb88 100644
--- a/bolt/lib/Core/MCPlusBuilder.cpp
+++ b/bolt/lib/Core/MCPlusBuilder.cpp
@@ -303,6 +303,28 @@ void MCPlusBuilder::setSize(MCInst &Inst, uint32_t Size) const {
setAnnotationOpValue(Inst, MCAnnotation::kSize, Size);
}
+bool MCPlusBuilder::isDynamicBranch(const MCInst &Inst) const {
+ if (!hasAnnotation(Inst, MCAnnotation::kDynamicBranch))
+ return false;
+ assert(isBranch(Inst) && "Branch expected.");
+ return true;
+}
+
+std::optional<uint32_t>
+MCPlusBuilder::getDynamicBranchID(const MCInst &Inst) const {
+ if (std::optional<int64_t> Value =
+ getAnnotationOpValue(Inst, MCAnnotation::kDynamicBranch)) {
+ assert(isBranch(Inst) && "Branch expected.");
+ return static_cast<uint32_t>(*Value);
+ }
+ return std::nullopt;
+}
+
+void MCPlusBuilder::setDynamicBranch(MCInst &Inst, uint32_t ID) const {
+ assert(isBranch(Inst) && "Branch expected.");
+ setAnnotationOpValue(Inst, MCAnnotation::kDynamicBranch, ID);
+}
+
bool MCPlusBuilder::hasAnnotation(const MCInst &Inst, unsigned Index) const {
return (bool)getAnnotationOpValue(Inst, Index);
}
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index bf1c2ddd37dd..c0ba73108f57 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -107,6 +107,12 @@ static cl::opt<unsigned>
cl::desc("print statistics about basic block ordering"),
cl::init(0), cl::cat(BoltOptCategory));
+static cl::opt<bool> PrintLargeFunctions(
+ "print-large-functions",
+ cl::desc("print functions that could not be overwritten due to excessive "
+ "size"),
+ cl::init(false), cl::cat(BoltOptCategory));
+
static cl::list<bolt::DynoStats::Category>
PrintSortedBy("print-sorted-by", cl::CommaSeparated,
cl::desc("print functions sorted by order of dyno stats"),
@@ -570,8 +576,12 @@ Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
uint64_t HotSize, ColdSize;
std::tie(HotSize, ColdSize) =
BC.calculateEmittedSize(BF, /*FixBranches=*/false);
- if (HotSize > BF.getMaxSize())
+ if (HotSize > BF.getMaxSize()) {
+ if (opts::PrintLargeFunctions)
+ BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
+ << (HotSize - BF.getMaxSize()) << " bytes\n";
BF.setSimple(false);
+ }
};
ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
@@ -852,6 +862,10 @@ uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
assert(Result && "internal error analyzing conditional branch");
assert(CondBranch && "conditional branch expected");
+ // Skip dynamic branches for now.
+ if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
+ continue;
+
// It's possible that PredBB is also a successor to BB that may have
// been processed by a previous iteration of the SCTC loop, in which
// case it may have been marked invalid. We should skip rewriting in
@@ -1012,6 +1026,10 @@ uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
const BinaryContext &BC = Function.getBinaryContext();
for (BinaryBasicBlock &BB : Function) {
for (MCInst &Inst : BB) {
+ // Skip shortening instructions with Size annotation.
+ if (BC.MIB->getSize(Inst))
+ continue;
+
MCInst OriginalInst;
if (opts::Verbosity > 2)
OriginalInst = Inst;
diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 5477d3b7d1c3..1d61a1b735b4 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -23,6 +23,9 @@ const char *BoltAddressTranslation::SECTION_NAME = ".note.bolt_bat";
void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
const BinaryBasicBlock &BB,
uint64_t FuncAddress) {
+ uint64_t HotFuncAddress = ColdPartSource.count(FuncAddress)
+ ? ColdPartSource[FuncAddress]
+ : FuncAddress;
const uint64_t BBOutputOffset =
BB.getOutputAddressRange().first - FuncAddress;
const uint32_t BBInputOffset = BB.getInputOffset();
@@ -39,6 +42,9 @@ void BoltAddressTranslation::writeEntriesForBB(MapTy &Map,
LLVM_DEBUG(dbgs() << "BB " << BB.getName() << "\n");
LLVM_DEBUG(dbgs() << " Key: " << Twine::utohexstr(BBOutputOffset)
<< " Val: " << Twine::utohexstr(BBInputOffset) << "\n");
+ LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n",
+ getBBHash(HotFuncAddress, BBInputOffset)));
+ (void)HotFuncAddress;
// In case of conflicts (same Key mapping to different Vals), the last
// update takes precedence. Of course it is not ideal to have conflicts and
// those happen when we have an empty BB that either contained only
@@ -72,20 +78,28 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Writing BOLT Address Translation Tables\n");
for (auto &BFI : BC.getBinaryFunctions()) {
const BinaryFunction &Function = BFI.second;
+ const uint64_t InputAddress = Function.getAddress();
+ const uint64_t OutputAddress = Function.getOutputAddress();
// We don't need a translation table if the body of the function hasn't
// changed
if (Function.isIgnored() || (!BC.HasRelocations && !Function.isSimple()))
continue;
+ // TBD: handle BAT functions w/multiple entry points.
+ if (Function.isMultiEntry())
+ continue;
+
LLVM_DEBUG(dbgs() << "Function name: " << Function.getPrintName() << "\n");
LLVM_DEBUG(dbgs() << " Address reference: 0x"
<< Twine::utohexstr(Function.getOutputAddress()) << "\n");
+ LLVM_DEBUG(dbgs() << formatv(" Hash: {0:x}\n", getBFHash(OutputAddress)));
MapTy Map;
for (const BinaryBasicBlock *const BB :
Function.getLayout().getMainFragment())
writeEntriesForBB(Map, *BB, Function.getOutputAddress());
Maps.emplace(Function.getOutputAddress(), std::move(Map));
+ ReverseMap.emplace(OutputAddress, InputAddress);
if (!Function.isSplit())
continue;
@@ -94,12 +108,12 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
LLVM_DEBUG(dbgs() << " Cold part\n");
for (const FunctionFragment &FF :
Function.getLayout().getSplitFragments()) {
+ ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
Map.clear();
for (const BinaryBasicBlock *const BB : FF)
writeEntriesForBB(Map, *BB, FF.getAddress());
Maps.emplace(FF.getAddress(), std::move(Map));
- ColdPartSource.emplace(FF.getAddress(), Function.getOutputAddress());
}
}
@@ -109,6 +123,11 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
writeMaps</*Cold=*/true>(Maps, PrevAddress, OS);
BC.outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
+ const uint64_t NumBBHashes = std::accumulate(
+ FuncHashes.begin(), FuncHashes.end(), 0ull,
+ [](size_t Acc, const auto &B) { return Acc + B.second.second.size(); });
+ BC.outs() << "BOLT-INFO: Wrote " << FuncHashes.size() << " function and "
+ << NumBBHashes << " basic block hashes\n";
}
APInt BoltAddressTranslation::calculateBranchEntriesBitMask(MapTy &Map,
@@ -155,6 +174,11 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
// Only process cold fragments in cold mode, and vice versa.
if (Cold != ColdPartSource.count(Address))
continue;
+ // NB: here we use the input address because hashes are saved early (in
+ // `saveMetadata`) before output addresses are assigned.
+ const uint64_t HotInputAddress =
+ ReverseMap[Cold ? ColdPartSource[Address] : Address];
+ std::pair<size_t, BBHashMap> &FuncHashPair = FuncHashes[HotInputAddress];
MapTy &Map = MapEntry.second;
const uint32_t NumEntries = Map.size();
LLVM_DEBUG(dbgs() << "Writing " << NumEntries << " entries for 0x"
@@ -166,6 +190,10 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
std::distance(ColdPartSource.begin(), ColdPartSource.find(Address));
encodeULEB128(HotIndex - PrevIndex, OS);
PrevIndex = HotIndex;
+ } else {
+ // Function hash
+ LLVM_DEBUG(dbgs() << "Hash: " << formatv("{0:x}\n", FuncHashPair.first));
+ OS.write(reinterpret_cast<char *>(&FuncHashPair.first), 8);
}
encodeULEB128(NumEntries, OS);
// For hot fragments only: encode the number of equal offsets
@@ -197,6 +225,13 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
if (Index++ >= EqualElems)
encodeSLEB128(KeyVal.second - InOffset, OS);
InOffset = KeyVal.second; // Keeping InOffset as if BRANCHENTRY is encoded
+ if ((InOffset & BRANCHENTRY) == 0) {
+ // Basic block hash
+ size_t BBHash = FuncHashPair.second[InOffset >> 1];
+ OS.write(reinterpret_cast<char *>(&BBHash), 8);
+ LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} {2:x}\n", KeyVal.first,
+ InOffset >> 1, BBHash));
+ }
}
}
}
@@ -239,12 +274,18 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
size_t HotIndex = 0;
for (uint32_t I = 0; I < NumFunctions; ++I) {
const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err);
+ uint64_t HotAddress = Cold ? 0 : Address;
PrevAddress = Address;
if (Cold) {
HotIndex += DE.getULEB128(&Offset, &Err);
- ColdPartSource.emplace(Address, HotFuncs[HotIndex]);
+ HotAddress = HotFuncs[HotIndex];
+ ColdPartSource.emplace(Address, HotAddress);
} else {
HotFuncs.push_back(Address);
+ // Function hash
+ const size_t FuncHash = DE.getU64(&Offset, &Err);
+ FuncHashes[Address].first = FuncHash;
+ LLVM_DEBUG(dbgs() << formatv("{0:x}: hash {1:x}\n", Address, FuncHash));
}
const uint32_t NumEntries = DE.getULEB128(&Offset, &Err);
// Equal offsets, hot fragments only.
@@ -288,12 +329,22 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
InputOffset += InputDelta;
}
Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
- LLVM_DEBUG(
- dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}\n",
- OutputOffset, InputOffset, OutputDelta,
- getULEB128Size(OutputDelta), InputDelta,
- (J < EqualElems) ? 0 : getSLEB128Size(InputDelta),
- OutputAddress));
+ size_t BBHash = 0;
+ const bool IsBranchEntry = InputOffset & BRANCHENTRY;
+ if (!IsBranchEntry) {
+ BBHash = DE.getU64(&Offset, &Err);
+ // Map basic block hash to hot fragment by input offset
+ FuncHashes[HotAddress].second.emplace(InputOffset >> 1, BBHash);
+ }
+ LLVM_DEBUG({
+ dbgs() << formatv(
+ "{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}", OutputOffset,
+ InputOffset, OutputDelta, getULEB128Size(OutputDelta), InputDelta,
+ (J < EqualElems) ? 0 : getSLEB128Size(InputDelta), OutputAddress);
+ if (BBHash)
+ dbgs() << formatv(" {0:x}", BBHash);
+ dbgs() << '\n';
+ });
}
Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
}
@@ -303,7 +354,12 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
const size_t NumTables = Maps.size();
OS << "BAT tables for " << NumTables << " functions:\n";
for (const auto &MapEntry : Maps) {
- OS << "Function Address: 0x" << Twine::utohexstr(MapEntry.first) << "\n";
+ const uint64_t Address = MapEntry.first;
+ const uint64_t HotAddress = fetchParentAddress(Address);
+ OS << "Function Address: 0x" << Twine::utohexstr(Address);
+ if (HotAddress == 0)
+ OS << formatv(", hash: {0:x}", getBFHash(Address));
+ OS << "\n";
OS << "BB mappings:\n";
for (const auto &Entry : MapEntry.second) {
const bool IsBranch = Entry.second & BRANCHENTRY;
@@ -312,6 +368,9 @@ void BoltAddressTranslation::dump(raw_ostream &OS) {
<< "0x" << Twine::utohexstr(Val);
if (IsBranch)
OS << " (branch)";
+ else
+ OS << formatv(" hash: {0:x}",
+ getBBHash(HotAddress ? HotAddress : Address, Val));
OS << "\n";
}
OS << "\n";
@@ -439,5 +498,15 @@ void BoltAddressTranslation::saveMetadata(BinaryContext &BC) {
BB.getHash());
}
}
+
+size_t BoltAddressTranslation::getBBHash(uint64_t FuncOutputAddress,
+ uint32_t BBInputOffset) const {
+ return FuncHashes.at(FuncOutputAddress).second.at(BBInputOffset);
+}
+
+size_t BoltAddressTranslation::getBFHash(uint64_t OutputAddress) const {
+ return FuncHashes.at(OutputAddress).first;
+}
+
} // namespace bolt
} // namespace llvm
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 6a64bcde911e..37c637a44a0e 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -16,6 +16,7 @@
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Profile/BoltAddressTranslation.h"
#include "bolt/Profile/Heatmap.h"
+#include "bolt/Profile/YAMLProfileWriter.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "bolt/Utils/Utils.h"
#include "llvm/ADT/STLExtras.h"
@@ -85,6 +86,7 @@ MaxSamples("max-samples",
cl::cat(AggregatorCategory));
extern cl::opt<opts::ProfileFormatKind> ProfileFormat;
+extern cl::opt<std::string> SaveProfile;
cl::opt<bool> ReadPreAggregated(
"pa", cl::desc("skip perf and read data from a pre-aggregated file format"),
@@ -594,10 +596,21 @@ Error DataAggregator::readProfile(BinaryContext &BC) {
convertBranchData(Function);
}
- if (opts::AggregateOnly &&
- opts::ProfileFormat == opts::ProfileFormatKind::PF_Fdata) {
- if (std::error_code EC = writeAggregatedFile(opts::OutputFilename))
- report_error("cannot create output data file", EC);
+ if (opts::AggregateOnly) {
+ if (opts::ProfileFormat == opts::ProfileFormatKind::PF_Fdata)
+ if (std::error_code EC = writeAggregatedFile(opts::OutputFilename))
+ report_error("cannot create output data file", EC);
+
+ // BAT YAML is handled by DataAggregator since normal YAML output requires
+ // CFG which is not available in BAT mode.
+ if (usesBAT()) {
+ if (opts::ProfileFormat == opts::ProfileFormatKind::PF_YAML)
+ if (std::error_code EC = writeBATYAML(BC, opts::OutputFilename))
+ report_error("cannot create output data file", EC);
+ if (!opts::SaveProfile.empty())
+ if (std::error_code EC = writeBATYAML(BC, opts::SaveProfile))
+ report_error("cannot create output data file", EC);
+ }
}
return Error::success();
@@ -2258,6 +2271,53 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const {
return std::error_code();
}
+std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
+ StringRef OutputFilename) const {
+ std::error_code EC;
+ raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
+ if (EC)
+ return EC;
+
+ yaml::bolt::BinaryProfile BP;
+
+ // Fill out the header info.
+ BP.Header.Version = 1;
+ BP.Header.FileName = std::string(BC.getFilename());
+ std::optional<StringRef> BuildID = BC.getFileBuildID();
+ BP.Header.Id = BuildID ? std::string(*BuildID) : "<unknown>";
+ BP.Header.Origin = std::string(getReaderName());
+ // Only the input binary layout order is supported.
+ BP.Header.IsDFSOrder = false;
+ // FIXME: Need to match hash function used to produce BAT hashes.
+ BP.Header.HashFunction = HashFunction::Default;
+
+ ListSeparator LS(",");
+ raw_string_ostream EventNamesOS(BP.Header.EventNames);
+ for (const StringMapEntry<std::nullopt_t> &EventEntry : EventNames)
+ EventNamesOS << LS << EventEntry.first().str();
+
+ BP.Header.Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
+ : BinaryFunction::PF_LBR;
+
+ if (!opts::BasicAggregation) {
+ // Convert profile for functions not covered by BAT
+ for (auto &BFI : BC.getBinaryFunctions()) {
+ BinaryFunction &Function = BFI.second;
+ if (!Function.hasProfile())
+ continue;
+ if (BAT->isBATFunction(Function.getAddress()))
+ continue;
+ BP.Functions.emplace_back(
+ YAMLProfileWriter::convert(Function, /*UseDFS=*/false));
+ }
+ }
+
+ // Write the profile.
+ yaml::Output Out(OutFile, nullptr, 0);
+ Out << BP;
+ return std::error_code();
+}
+
void DataAggregator::dump() const { DataReader::dump(); }
void DataAggregator::dump(const LBREntry &LBR) const {
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index dffd851a1d6f..6fcc4a956fa1 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -10,7 +10,6 @@
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Profile/ProfileReaderBase.h"
-#include "bolt/Profile/ProfileYAMLMapping.h"
#include "bolt/Rewrite/RewriteInstance.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
@@ -26,15 +25,15 @@ extern llvm::cl::opt<bool> ProfileUseDFS;
namespace llvm {
namespace bolt {
-namespace {
-void convert(const BinaryFunction &BF,
- yaml::bolt::BinaryFunctionProfile &YamlBF) {
+yaml::bolt::BinaryFunctionProfile
+YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS) {
+ yaml::bolt::BinaryFunctionProfile YamlBF;
const BinaryContext &BC = BF.getBinaryContext();
const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR;
// Prepare function and block hashes
- BF.computeHash(opts::ProfileUseDFS);
+ BF.computeHash(UseDFS);
BF.computeBlockHashes();
YamlBF.Name = BF.getPrintName();
@@ -44,7 +43,7 @@ void convert(const BinaryFunction &BF,
YamlBF.ExecCount = BF.getKnownExecutionCount();
BinaryFunction::BasicBlockOrderType Order;
- llvm::copy(opts::ProfileUseDFS ? BF.dfs() : BF.getLayout().blocks(),
+ llvm::copy(UseDFS ? BF.dfs() : BF.getLayout().blocks(),
std::back_inserter(Order));
for (const BinaryBasicBlock *BB : Order) {
@@ -106,20 +105,14 @@ void convert(const BinaryFunction &BF,
TargetName = Callee->getOneName();
}
+ auto getAnnotationWithDefault = [&](const MCInst &Inst, StringRef Ann) {
+ return BC.MIB->getAnnotationWithDefault(Instr, Ann, 0ull);
+ };
if (BC.MIB->getConditionalTailCall(Instr)) {
- auto CTCCount =
- BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCTakenCount");
- if (CTCCount) {
- CSI.Count = *CTCCount;
- auto CTCMispreds =
- BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "CTCMispredCount");
- if (CTCMispreds)
- CSI.Mispreds = *CTCMispreds;
- }
+ CSI.Count = getAnnotationWithDefault(Instr, "CTCTakenCount");
+ CSI.Mispreds = getAnnotationWithDefault(Instr, "CTCMispredCount");
} else {
- auto Count = BC.MIB->tryGetAnnotationAs<uint64_t>(Instr, "Count");
- if (Count)
- CSI.Count = *Count;
+ CSI.Count = getAnnotationWithDefault(Instr, "Count");
}
if (CSI.Count)
@@ -165,8 +158,8 @@ void convert(const BinaryFunction &BF,
YamlBF.Blocks.emplace_back(YamlBB);
}
+ return YamlBF;
}
-} // end anonymous namespace
std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
const BinaryContext &BC = RI.getBinaryContext();
@@ -222,9 +215,7 @@ std::error_code YAMLProfileWriter::writeProfile(const RewriteInstance &RI) {
if (!BF.hasValidProfile() && !RI.getProfileReader()->isTrustedSource())
continue;
- yaml::bolt::BinaryFunctionProfile YamlBF;
- convert(BF, YamlBF);
- BP.Functions.emplace_back(YamlBF);
+ BP.Functions.emplace_back(convert(BF, opts::ProfileUseDFS));
}
}
diff --git a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
index a2bfd45a64e3..b028a455a6db 100644
--- a/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
+++ b/bolt/lib/Rewrite/LinuxKernelRewriter.cpp
@@ -14,7 +14,9 @@
#include "bolt/Rewrite/MetadataRewriter.h"
#include "bolt/Rewrite/MetadataRewriters.h"
#include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -66,6 +68,16 @@ static cl::opt<bool> DumpStaticCalls("dump-static-calls",
cl::cat(BoltCategory));
static cl::opt<bool>
+ DumpStaticKeys("dump-static-keys",
+ cl::desc("dump Linux kernel static keys jump table"),
+ cl::init(false), cl::Hidden, cl::cat(BoltCategory));
+
+static cl::opt<bool> LongJumpLabels(
+ "long-jump-labels",
+ cl::desc("always use long jumps/nops for Linux kernel static keys"),
+ cl::init(false), cl::Hidden, cl::cat(BoltCategory));
+
+static cl::opt<bool>
PrintORC("print-orc",
cl::desc("print ORC unwind information for instructions"),
cl::init(true), cl::Hidden, cl::cat(BoltCategory));
@@ -151,6 +163,20 @@ class LinuxKernelRewriter final : public MetadataRewriter {
/// Number of entries in the input file ORC sections.
uint64_t NumORCEntries = 0;
+ /// Section containing static keys jump table.
+ ErrorOr<BinarySection &> StaticKeysJumpSection = std::errc::bad_address;
+ uint64_t StaticKeysJumpTableAddress = 0;
+ static constexpr size_t STATIC_KEYS_JUMP_ENTRY_SIZE = 8;
+
+ struct JumpInfoEntry {
+ bool Likely;
+ bool InitValue;
+ };
+ SmallVector<JumpInfoEntry, 16> JumpInfo;
+
+ /// Static key entries that need nop conversion.
+ DenseSet<uint32_t> NopIDs;
+
/// Section containing static call table.
ErrorOr<BinarySection &> StaticCallSection = std::errc::bad_address;
uint64_t StaticCallTableAddress = 0;
@@ -235,6 +261,11 @@ class LinuxKernelRewriter final : public MetadataRewriter {
/// Read .pci_fixup
Error readPCIFixupTable();
+ /// Handle static keys jump table.
+ Error readStaticKeysJumpTable();
+ Error rewriteStaticKeysJumpTable();
+ Error updateStaticKeysJumpTablePostEmit();
+
/// Mark instructions referenced by kernel metadata.
Error markInstructions();
@@ -268,6 +299,9 @@ public:
if (Error E = readPCIFixupTable())
return E;
+ if (Error E = readStaticKeysJumpTable())
+ return E;
+
return Error::success();
}
@@ -290,12 +324,18 @@ public:
if (Error E = rewriteStaticCalls())
return E;
+ if (Error E = rewriteStaticKeysJumpTable())
+ return E;
+
return Error::success();
}
Error postEmitFinalizer() override {
updateLKMarkers();
+ if (Error E = updateStaticKeysJumpTablePostEmit())
+ return E;
+
return Error::success();
}
};
@@ -1343,6 +1383,351 @@ Error LinuxKernelRewriter::readPCIFixupTable() {
return Error::success();
}
+/// Runtime code modification used by static keys is the most ubiquitous
+/// self-modifying feature of the Linux kernel. The idea is to eliminate the
+/// condition check and associated conditional jump on a hot path if that
+/// condition (based on a boolean value of a static key) does not change often.
+/// Whenever the condition changes, the kernel runtime modifies all code paths
+/// associated with that key flipping the code between nop and (unconditional)
+/// jump. The information about the code is stored in a static key jump table
+/// and contains the list of entries of the following type from
+/// include/linux/jump_label.h:
+//
+/// struct jump_entry {
+/// s32 code;
+/// s32 target;
+/// long key; // key may be far away from the core kernel under KASLR
+/// };
+///
+/// The list does not have to be stored in any sorted way, but it is sorted at
+/// boot time (or module initialization time) first by "key" and then by "code".
+/// jump_label_sort_entries() is responsible for sorting the table.
+///
+/// The key in jump_entry structure uses lower two bits of the key address
+/// (which itself is aligned) to store extra information. We are interested in
+/// the lower bit which indicates if the key is likely to be set on the code
+/// path associated with this jump_entry.
+///
+/// static_key_{enable,disable}() functions modify the code based on key and
+/// jump table entries.
+///
+/// jump_label_update() updates all code entries for a given key. Batch mode is
+/// used for x86.
+///
+/// The actual patching happens in text_poke_bp_batch() that overrides the first
+/// byte of the sequence with int3 before proceeding with actual code
+/// replacement.
+Error LinuxKernelRewriter::readStaticKeysJumpTable() {
+ const BinaryData *StaticKeysJumpTable =
+ BC.getBinaryDataByName("__start___jump_table");
+ if (!StaticKeysJumpTable)
+ return Error::success();
+
+ StaticKeysJumpTableAddress = StaticKeysJumpTable->getAddress();
+
+ const BinaryData *Stop = BC.getBinaryDataByName("__stop___jump_table");
+ if (!Stop)
+ return createStringError(errc::executable_format_error,
+ "missing __stop___jump_table symbol");
+
+ ErrorOr<BinarySection &> ErrorOrSection =
+ BC.getSectionForAddress(StaticKeysJumpTableAddress);
+ if (!ErrorOrSection)
+ return createStringError(errc::executable_format_error,
+ "no section matching __start___jump_table");
+
+ StaticKeysJumpSection = *ErrorOrSection;
+ if (!StaticKeysJumpSection->containsAddress(Stop->getAddress() - 1))
+ return createStringError(errc::executable_format_error,
+ "__stop___jump_table not in the same section "
+ "as __start___jump_table");
+
+ if ((Stop->getAddress() - StaticKeysJumpTableAddress) %
+ STATIC_KEYS_JUMP_ENTRY_SIZE)
+ return createStringError(errc::executable_format_error,
+ "static keys jump table size error");
+
+ const uint64_t SectionAddress = StaticKeysJumpSection->getAddress();
+ DataExtractor DE(StaticKeysJumpSection->getContents(),
+ BC.AsmInfo->isLittleEndian(),
+ BC.AsmInfo->getCodePointerSize());
+ DataExtractor::Cursor Cursor(StaticKeysJumpTableAddress - SectionAddress);
+ uint32_t EntryID = 0;
+ while (Cursor && Cursor.tell() < Stop->getAddress() - SectionAddress) {
+ const uint64_t JumpAddress =
+ SectionAddress + Cursor.tell() + (int32_t)DE.getU32(Cursor);
+ const uint64_t TargetAddress =
+ SectionAddress + Cursor.tell() + (int32_t)DE.getU32(Cursor);
+ const uint64_t KeyAddress =
+ SectionAddress + Cursor.tell() + (int64_t)DE.getU64(Cursor);
+
+ // Consume the status of the cursor.
+ if (!Cursor)
+ return createStringError(
+ errc::executable_format_error,
+ "out of bounds while reading static keys jump table: %s",
+ toString(Cursor.takeError()).c_str());
+
+ ++EntryID;
+
+ JumpInfo.push_back(JumpInfoEntry());
+ JumpInfoEntry &Info = JumpInfo.back();
+ Info.Likely = KeyAddress & 1;
+
+ if (opts::DumpStaticKeys) {
+ BC.outs() << "Static key jump entry: " << EntryID
+ << "\n\tJumpAddress: 0x" << Twine::utohexstr(JumpAddress)
+ << "\n\tTargetAddress: 0x" << Twine::utohexstr(TargetAddress)
+ << "\n\tKeyAddress: 0x" << Twine::utohexstr(KeyAddress)
+ << "\n\tIsLikely: " << Info.Likely << '\n';
+ }
+
+ BinaryFunction *BF = BC.getBinaryFunctionContainingAddress(JumpAddress);
+ if (!BF && opts::Verbosity) {
+ BC.outs()
+ << "BOLT-INFO: no function matches address 0x"
+ << Twine::utohexstr(JumpAddress)
+ << " of jump instruction referenced from static keys jump table\n";
+ }
+
+ if (!BF || !BC.shouldEmit(*BF))
+ continue;
+
+ MCInst *Inst = BF->getInstructionAtOffset(JumpAddress - BF->getAddress());
+ if (!Inst)
+ return createStringError(
+ errc::executable_format_error,
+ "no instruction at static keys jump site address 0x%" PRIx64,
+ JumpAddress);
+
+ if (!BF->containsAddress(TargetAddress))
+ return createStringError(
+ errc::executable_format_error,
+ "invalid target of static keys jump at 0x%" PRIx64 " : 0x%" PRIx64,
+ JumpAddress, TargetAddress);
+
+ const bool IsBranch = BC.MIB->isBranch(*Inst);
+ if (!IsBranch && !BC.MIB->isNoop(*Inst))
+ return createStringError(errc::executable_format_error,
+ "jump or nop expected at address 0x%" PRIx64,
+ JumpAddress);
+
+ const uint64_t Size = BC.computeInstructionSize(*Inst);
+ if (Size != 2 && Size != 5) {
+ return createStringError(
+ errc::executable_format_error,
+ "unexpected static keys jump size at address 0x%" PRIx64,
+ JumpAddress);
+ }
+
+ MCSymbol *Target = BF->registerBranch(JumpAddress, TargetAddress);
+ MCInst StaticKeyBranch;
+
+ // Create a conditional branch instruction. The actual conditional code type
+ // should not matter as long as it's a valid code. The instruction should be
+ // treated as a conditional branch for control-flow purposes. Before we emit
+ // the code, it will be converted to a different instruction in
+ // rewriteStaticKeysJumpTable().
+ //
+ // NB: for older kernels, under LongJumpLabels option, we create long
+ // conditional branch to guarantee that code size estimation takes
+ // into account the extra bytes needed for long branch that will be used
+ // by the kernel patching code. Newer kernels can work with both short
+ // and long branches. The code for long conditional branch is larger
+ // than unconditional one, so we are pessimistic in our estimations.
+ if (opts::LongJumpLabels)
+ BC.MIB->createLongCondBranch(StaticKeyBranch, Target, 0, BC.Ctx.get());
+ else
+ BC.MIB->createCondBranch(StaticKeyBranch, Target, 0, BC.Ctx.get());
+ BC.MIB->moveAnnotations(std::move(*Inst), StaticKeyBranch);
+ BC.MIB->setDynamicBranch(StaticKeyBranch, EntryID);
+ *Inst = StaticKeyBranch;
+
+ // IsBranch = InitialValue ^ LIKELY
+ //
+ // 0 0 0
+ // 1 0 1
+ // 1 1 0
+ // 0 1 1
+ //
+ // => InitialValue = IsBranch ^ LIKELY
+ Info.InitValue = IsBranch ^ Info.Likely;
+
+ // Add annotations to facilitate manual code analysis.
+ BC.MIB->addAnnotation(*Inst, "Likely", Info.Likely);
+ BC.MIB->addAnnotation(*Inst, "InitValue", Info.InitValue);
+ if (!BC.MIB->getSize(*Inst))
+ BC.MIB->setSize(*Inst, Size);
+
+ if (opts::LongJumpLabels)
+ BC.MIB->setSize(*Inst, 5);
+ }
+
+ BC.outs() << "BOLT-INFO: parsed " << EntryID << " static keys jump entries\n";
+
+ return Error::success();
+}
+
+// Pre-emit pass. Convert dynamic branch instructions into jumps that could be
+// relaxed. In post-emit pass we will convert those jumps into nops when
+// necessary. We do the unconditional conversion into jumps so that the jumps
+// can be relaxed and the optimal size of jump/nop instruction is selected.
+Error LinuxKernelRewriter::rewriteStaticKeysJumpTable() {
+ if (!StaticKeysJumpSection)
+ return Error::success();
+
+ uint64_t NumShort = 0;
+ uint64_t NumLong = 0;
+ for (BinaryFunction &BF : llvm::make_second_range(BC.getBinaryFunctions())) {
+ if (!BC.shouldEmit(BF))
+ continue;
+
+ for (BinaryBasicBlock &BB : BF) {
+ for (MCInst &Inst : BB) {
+ if (!BC.MIB->isDynamicBranch(Inst))
+ continue;
+
+ const uint32_t EntryID = *BC.MIB->getDynamicBranchID(Inst);
+ MCSymbol *Target =
+ const_cast<MCSymbol *>(BC.MIB->getTargetSymbol(Inst));
+ assert(Target && "Target symbol should be set.");
+
+ const JumpInfoEntry &Info = JumpInfo[EntryID - 1];
+ const bool IsBranch = Info.Likely ^ Info.InitValue;
+
+ uint32_t Size = *BC.MIB->getSize(Inst);
+ if (Size == 2)
+ ++NumShort;
+ else if (Size == 5)
+ ++NumLong;
+ else
+ llvm_unreachable("Wrong size for static keys jump instruction.");
+
+ MCInst NewInst;
+ // Replace the instruction with unconditional jump even if it needs to
+ // be nop in the binary.
+ if (opts::LongJumpLabels) {
+ BC.MIB->createLongUncondBranch(NewInst, Target, BC.Ctx.get());
+ } else {
+ // Newer kernels can handle short and long jumps for static keys.
+ // Optimistically, emit short jump and check if it gets relaxed into
+ // a long one during post-emit. Only then convert the jump to a nop.
+ BC.MIB->createUncondBranch(NewInst, Target, BC.Ctx.get());
+ }
+
+ BC.MIB->moveAnnotations(std::move(Inst), NewInst);
+ Inst = NewInst;
+
+ // Mark the instruction for nop conversion.
+ if (!IsBranch)
+ NopIDs.insert(EntryID);
+
+ MCSymbol *Label =
+ BC.MIB->getOrCreateInstLabel(Inst, "__SK_", BC.Ctx.get());
+
+ // Create a relocation against the label.
+ const uint64_t EntryOffset = StaticKeysJumpTableAddress -
+ StaticKeysJumpSection->getAddress() +
+ (EntryID - 1) * 16;
+ StaticKeysJumpSection->addRelocation(EntryOffset, Label,
+ ELF::R_X86_64_PC32,
+ /*Addend*/ 0);
+ StaticKeysJumpSection->addRelocation(EntryOffset + 4, Target,
+ ELF::R_X86_64_PC32, /*Addend*/ 0);
+ }
+ }
+ }
+
+ BC.outs() << "BOLT-INFO: the input contains " << NumShort << " short and "
+ << NumLong << " long static keys jumps in optimized functions\n";
+
+ return Error::success();
+}
+
+// Post-emit pass of static keys jump section. Convert jumps to nops.
+Error LinuxKernelRewriter::updateStaticKeysJumpTablePostEmit() {
+ if (!StaticKeysJumpSection || !StaticKeysJumpSection->isFinalized())
+ return Error::success();
+
+ const uint64_t SectionAddress = StaticKeysJumpSection->getAddress();
+ DataExtractor DE(StaticKeysJumpSection->getOutputContents(),
+ BC.AsmInfo->isLittleEndian(),
+ BC.AsmInfo->getCodePointerSize());
+ DataExtractor::Cursor Cursor(StaticKeysJumpTableAddress - SectionAddress);
+ const BinaryData *Stop = BC.getBinaryDataByName("__stop___jump_table");
+ uint32_t EntryID = 0;
+ uint64_t NumShort = 0;
+ uint64_t NumLong = 0;
+ while (Cursor && Cursor.tell() < Stop->getAddress() - SectionAddress) {
+ const uint64_t JumpAddress =
+ SectionAddress + Cursor.tell() + (int32_t)DE.getU32(Cursor);
+ const uint64_t TargetAddress =
+ SectionAddress + Cursor.tell() + (int32_t)DE.getU32(Cursor);
+ const uint64_t KeyAddress =
+ SectionAddress + Cursor.tell() + (int64_t)DE.getU64(Cursor);
+
+ // Consume the status of the cursor.
+ if (!Cursor)
+ return createStringError(errc::executable_format_error,
+ "out of bounds while updating static keys: %s",
+ toString(Cursor.takeError()).c_str());
+
+ ++EntryID;
+
+ LLVM_DEBUG({
+ dbgs() << "\n\tJumpAddress: 0x" << Twine::utohexstr(JumpAddress)
+ << "\n\tTargetAddress: 0x" << Twine::utohexstr(TargetAddress)
+ << "\n\tKeyAddress: 0x" << Twine::utohexstr(KeyAddress) << '\n';
+ });
+
+ BinaryFunction *BF =
+ BC.getBinaryFunctionContainingAddress(JumpAddress,
+ /*CheckPastEnd*/ false,
+ /*UseMaxSize*/ true);
+ assert(BF && "Cannot get function for modified static key.");
+
+ if (!BF->isEmitted())
+ continue;
+
+ // Disassemble instruction to collect stats even if nop-conversion is
+ // unnecessary.
+ MutableArrayRef<uint8_t> Contents = MutableArrayRef<uint8_t>(
+ reinterpret_cast<uint8_t *>(BF->getImageAddress()), BF->getImageSize());
+ assert(Contents.size() && "Non-empty function image expected.");
+
+ MCInst Inst;
+ uint64_t Size;
+ const uint64_t JumpOffset = JumpAddress - BF->getAddress();
+ if (!BC.DisAsm->getInstruction(Inst, Size, Contents.slice(JumpOffset), 0,
+ nulls())) {
+ llvm_unreachable("Unable to disassemble jump instruction.");
+ }
+ assert(BC.MIB->isBranch(Inst) && "Branch instruction expected.");
+
+ if (Size == 2)
+ ++NumShort;
+ else if (Size == 5)
+ ++NumLong;
+ else
+ llvm_unreachable("Unexpected size for static keys jump instruction.");
+
+ // Check if we need to convert jump instruction into a nop.
+ if (!NopIDs.contains(EntryID))
+ continue;
+
+ SmallString<15> NopCode;
+ raw_svector_ostream VecOS(NopCode);
+ BC.MAB->writeNopData(VecOS, Size, BC.STI.get());
+ for (uint64_t I = 0; I < Size; ++I)
+ Contents[JumpOffset + I] = NopCode[I];
+ }
+
+ BC.outs() << "BOLT-INFO: written " << NumShort << " short and " << NumLong
+ << " long static keys jumps in optimized functions\n";
+
+ return Error::success();
+}
+
} // namespace
std::unique_ptr<MetadataRewriter>
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index cde195c17390..03f4298e817d 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -199,10 +199,7 @@ static cl::opt<cl::boolOrDefault> RelocationMode(
"relocs", cl::desc("use relocations in the binary (default=autodetect)"),
cl::cat(BoltCategory));
-static cl::opt<std::string>
-SaveProfile("w",
- cl::desc("save recorded profile to a file"),
- cl::cat(BoltOutputCategory));
+extern cl::opt<std::string> SaveProfile;
static cl::list<std::string>
SkipFunctionNames("skip-funcs",
@@ -732,6 +729,13 @@ Error RewriteInstance::run() {
// Skip disassembling if we have a translation table and we are running an
// aggregation job.
if (opts::AggregateOnly && BAT->enabledFor(InputFile)) {
+ // YAML profile in BAT mode requires CFG for .bolt.org.text functions
+ if (!opts::SaveProfile.empty() ||
+ opts::ProfileFormat == opts::ProfileFormatKind::PF_YAML) {
+ selectFunctionsToProcess();
+ disassembleFunctions();
+ buildFunctionsCFG();
+ }
processProfileData();
return Error::success();
}
@@ -2027,14 +2031,6 @@ void RewriteInstance::adjustCommandLineOptions() {
if (opts::Lite)
BC->outs() << "BOLT-INFO: enabling lite mode\n";
-
- if (!opts::SaveProfile.empty() && BAT->enabledFor(InputFile)) {
- BC->errs()
- << "BOLT-ERROR: unable to save profile in YAML format for input "
- "file processed by BOLT. Please remove -w option and use branch "
- "profile.\n";
- exit(1);
- }
}
namespace {
@@ -3126,12 +3122,13 @@ void RewriteInstance::processProfileData() {
}
}
- if (!opts::SaveProfile.empty()) {
+ if (!opts::SaveProfile.empty() && !BAT->enabledFor(InputFile)) {
YAMLProfileWriter PW(opts::SaveProfile);
PW.writeProfile(*this);
}
if (opts::AggregateOnly &&
- opts::ProfileFormat == opts::ProfileFormatKind::PF_YAML) {
+ opts::ProfileFormat == opts::ProfileFormatKind::PF_YAML &&
+ !BAT->enabledFor(InputFile)) {
YAMLProfileWriter PW(opts::OutputFilename);
PW.writeProfile(*this);
}
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index de55fbe51764..15f95f821777 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -336,6 +336,9 @@ public:
}
bool isUnsupportedBranch(const MCInst &Inst) const override {
+ if (isDynamicBranch(Inst))
+ return true;
+
switch (Inst.getOpcode()) {
default:
return false;
@@ -2728,6 +2731,7 @@ public:
void createUncondBranch(MCInst &Inst, const MCSymbol *TBB,
MCContext *Ctx) const override {
+ Inst.clear();
Inst.setOpcode(X86::JMP_1);
Inst.clear();
Inst.addOperand(MCOperand::createExpr(
@@ -2776,6 +2780,15 @@ public:
Inst.addOperand(MCOperand::createImm(CC));
}
+ void createLongCondBranch(MCInst &Inst, const MCSymbol *Target, unsigned CC,
+ MCContext *Ctx) const override {
+ Inst.setOpcode(X86::JCC_4);
+ Inst.clear();
+ Inst.addOperand(MCOperand::createExpr(
+ MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx)));
+ Inst.addOperand(MCOperand::createImm(CC));
+ }
+
bool reverseBranchCondition(MCInst &Inst, const MCSymbol *TBB,
MCContext *Ctx) const override {
unsigned InvCC = getInvertedCondCode(getCondCode(Inst));
diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp
index e910fa4f8672..ba296c10c00a 100644
--- a/bolt/lib/Utils/CommandLineOpts.cpp
+++ b/bolt/lib/Utils/CommandLineOpts.cpp
@@ -162,6 +162,10 @@ cl::opt<ProfileFormatKind> ProfileFormat(
clEnumValN(PF_YAML, "yaml", "dense YAML representation")),
cl::ZeroOrMore, cl::Hidden, cl::cat(BoltCategory));
+cl::opt<std::string> SaveProfile("w",
+ cl::desc("save recorded profile to a file"),
+ cl::cat(BoltOutputCategory));
+
cl::opt<bool> SplitEH("split-eh", cl::desc("split C++ exception handling code"),
cl::Hidden, cl::cat(BoltOptCategory));
diff --git a/bolt/test/X86/Inputs/blarge_new.preagg.txt b/bolt/test/X86/Inputs/blarge_new.preagg.txt
new file mode 100644
index 000000000000..e92f356d9188
--- /dev/null
+++ b/bolt/test/X86/Inputs/blarge_new.preagg.txt
@@ -0,0 +1,81 @@
+B 40164b 401608 109 0
+B 401611 4017e0 115 0
+B 4017f0 401616 117 0
+B 401ba2 4015da 6 0
+B 4015d5 401b60 1 0
+B 40159a 401b60 5 0
+B 401b9d 401b70 615 2
+B 401b90 401b99 344 37
+B 401ba2 40159f 8 0
+B 4015b0 401070 9 0
+B 401544 4014a0 6 0
+B 40188a 401928 5 0
+B 40152a 4014b0 21 0
+B 40169e 40165b 2 0
+B 4014dd 401070 12 1
+B 401509 4014ec 2 2
+B 401510 401030 673 0
+B 4019de 401080 1 0
+B 401500 401070 22 0
+B 401921 4014d6 9 0
+B 4019b3 401080 3 0
+B 40162d 401070 113 0
+B 4014d1 401800 27 0
+B 401a3f 401080 1 0
+B 4018d2 401050 17 0
+B 401664 4017c0 2 0
+B 401680 401070 2 0
+B 4017d0 401669 2 0
+B 4018f7 40190d 9 0
+B 4015bc 401592 6 0
+B 401964 401090 5 0
+B 4015f8 4015cd 1 0
+B 4015ec 401070 6 0
+F 40165b 401664 2
+F 4017c0 4017d0 2
+F 401669 401680 2
+F 40190d 401921 9
+F 4014d6 4014dd 9
+F 401800 4018d2 17
+F 4018d7 4018f7 9
+F 40159f 4015b0 8
+F 401515 401544 6
+F 401070 401500 1
+F 401070 401070 157
+F 4014a0 4014d1 6
+F 401616 40162d 112
+F 4019e3 401a3f 1
+F 4014e2 401500 19
+F 401090 401090 5
+F 401030 401030 673
+F 401505 401510 668
+F 401616 4017f0 2
+F 401070 4015b0 1
+F 4015da 4015ec 6
+F 401b60 401b90 6
+F 4019b8 4019de 1
+F 401969 4019b3 3
+F 401505 401509 2
+F 401515 40152a 21
+F 401592 40159a 4
+F 401050 401050 17
+F 4015cd 4015d5 1
+F 401070 4014dd 1
+F 401b99 401ba2 8
+F 401b70 401b90 326
+F 401b99 401b9d 324
+F 401592 4015bc 1
+F 401608 401611 109
+F 401b70 401b9d 268
+F 4015b5 4015bc 5
+F 401b99 401b90 1
+F 401b70 401ba2 5
+F 401632 40164b 108
+F 401080 401080 5
+F 4014b0 4014d1 21
+F 4017e0 4017f0 115
+F 4015f1 4015f8 1
+F 401685 40169e 2
+F 401928 401964 5
+F 401800 40188a 5
+F 4014ec 401500 2
diff --git a/bolt/test/X86/Inputs/blarge_new.yaml b/bolt/test/X86/Inputs/blarge_new.yaml
new file mode 100644
index 000000000000..0380f5180e90
--- /dev/null
+++ b/bolt/test/X86/Inputs/blarge_new.yaml
@@ -0,0 +1,1648 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+ Entry: 0x4016D0
+ProgramHeaders:
+ - Type: PT_PHDR
+ Flags: [ PF_R ]
+ VAddr: 0x400040
+ Align: 0x8
+ Offset: 0x40
+ - Type: PT_INTERP
+ Flags: [ PF_R ]
+ FirstSec: .interp
+ LastSec: .interp
+ VAddr: 0x4002A8
+ Offset: 0x2A8
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .interp
+ LastSec: .rela.plt
+ VAddr: 0x400000
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x401000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x402000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x403E00
+ Align: 0x1000
+ Offset: 0x2E00
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x403E10
+ Align: 0x8
+ Offset: 0x2E10
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.ABI-tag
+ VAddr: 0x4002C4
+ Align: 0x4
+ Offset: 0x2C4
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x402270
+ Align: 0x4
+ Offset: 0x2270
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x403E00
+ Offset: 0x2E00
+Sections:
+ - Name: .interp
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4002A8
+ AddressAlign: 0x1
+ Content: 2F6C696236342F6C642D6C696E75782D7838362D36342E736F2E3200
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4002C4
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 66CF856212C3B313EA98AD840984B20EA781118A
+ Type: NT_PRPSINFO
+ - Name: .note.ABI-tag
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4002E8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: '00000000030000000200000000000000'
+ Type: NT_VERSION
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400308
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x1
+ Shift2: 0x0
+ BloomFilter: [ 0x0 ]
+ HashBuckets: [ 0x0 ]
+ HashValues: [ ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400328
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400430
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4004BA
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 2, 2, 3, 4, 2, 5, 5, 2, 0, 5 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4004D0
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libm.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 5
+ - Name: GLIBC_2.29
+ Hash: 110530953
+ Flags: 0
+ Other: 3
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.4
+ Hash: 225011988
+ Flags: 0
+ Other: 4
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x400530
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x403FF0
+ Symbol: __libc_start_main
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x403FF8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x400560
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x404018
+ Symbol: putchar
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404020
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404028
+ Symbol: pow
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404030
+ Symbol: __stack_chk_fail
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404038
+ Symbol: printf
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404040
+ Symbol: cos
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404048
+ Symbol: acos
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x404050
+ Symbol: sqrt
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x401000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05E92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x401020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000FF25E42F00000F1F4000FF25E22F00006800000000E9E0FFFFFFFF25DA2F00006801000000E9D0FFFFFFFF25D22F00006802000000E9C0FFFFFFFF25CA2F00006803000000E9B0FFFFFFFF25C22F00006804000000E9A0FFFFFFFF25BA2F00006805000000E990FFFFFFFF25B22F00006806000000E980FFFFFFFF25AA2F00006807000000E970FFFFFF
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x4010B0
+ AddressAlign: 0x10
+ Content: 4156BF082040004155415455534883EC5064488B042528000000488944244831C0E86AFFFFFF488D742430488D7C2424488B0529100000F20F101529100000F20F100D2910000066480F6ED8488B050510000066480F6EC0E8F3060000BFBF20400031C0E857FFFFFF448B5C24244585DB7E2131DBF20F1044DC30BFCA204000B8010000004883C301E832FFFFFF395C24247FE1BF0A000000E8E2FEFFFF488D742430488D7C2424488B05B10F0000F20F1015C10F0000F20F100DC10F000066480F6ED8488B058D0F000066480F6EC0E87B060000BFBF20400031C0E8DFFEFFFF448B5424244585D27E2131DBF20F1044DC30BFCA204000B8010000004883C301E8BAFEFFFF395C24247FE1BF0A000000E86AFEFFFF488B053B0F0000F20F101D630F0000488D742430F20F10155E0F0000F20F100D5E0F0000488D7C242466480F6EC0E807060000BFBF20400031C0E86BFEFFFF448B4C24244585C97E2131DBF20F1044DC30BFCA204000B8010000004883C301E846FEFFFF395C24247FE1BF0A000000E8F6FDFFFF488D742430488D7C2424488B05BD0E0000F20F101DFD0E0000F20F100DFD0E000066480F6ED066480F6EC0E896050000BFBF20400031C0E8FAFDFFFF448B4424244585C07E2131DBF20F1044DC30BFCA204000B8010000004883C301E8D5FDFFFF395C24247FE1BF0A000000E885FDFFFF488B05460E0000F20F101DA60E0000488D742430F20F100DA10E0000F20F1005A10E0000488D7C242466480F6ED0E822050000BFBF20400031C0E886FDFFFF8B7C242485FF7E2131DBF20F1044DC30BFCA204000B8010000004883C301E863FDFFFF395C24247FE1BF0A000000E813FDFFFFF20F101D530E0000F20F1015530E0000488D742430F20F100D4E0E0000F20F10054E0E0000488D7C2424E8B4040000BFBF20400031C0E818FDFFFF8B74242485F67E2131DBF20F1044DC30BFCA204000B8010000004883C301E8F5FCFFFF395C24247FE1BF0A000000E8A5FCFFFFF20F101D050E0000F20F1015050E0000488D742430F20F100D000E0000F20F1005000E0000488D7C2424E846040000BFBF20400031C0E8AAFCFFFF8B4C242485C97E2131DBF20F1044DC30BFCA204000B8010000004883C301E887FCFFFF395C24247FE1BF0A000000E837FCFFFFF20F101DB70D0000F20F1015B70D0000488D742430F20F100DB20D0000F20F1005B20D0000488D7C2424E8D8030000BFBF20400031C0E83CFCFFFF8B54242485D27E2131DBF20F1044DC30BFCA204000B8010000004883C301E819FCFFFF395C24247FE1BF0A00000041BD09000000E8C3FBFFFF488B05940C00004889442410488B05800C000041BE280000004889442418488B05660C00004
1BC1100000048894424080F1F00488B05490C0000BD0900000048890424F20F101C24488D742430488D7C2424F20F10542408F20F104C2418F20F10442410E82A030000BFBF20400031C0E88EFBFFFF8B44242485C07E2131DBF20F1044DC30BFCA204000B8010000004883C301E86BFBFFFF395C24247FE1BF0A000000E81BFBFFFFF20F102424F20F5C25B60C0000F20F11242483ED017584F20F102DAC0C0000F20F586C2408F20F116C24084183EC010F8556FFFFFFF20F107C2418F20F5C3D900C0000F20F117C24184183EE010F8523FFFFFFF20F103D980B0000F20F587C2410F20F117C24104183ED010F85F3FEFFFFBF3020400031DBE8AEFAFFFF4889DF488D742428E8C10500008B54242889DEBFCE20400031C04883C302E8BBFAFFFF4881FBA086010075D4BF0A000000BB6901ED3FE863FAFFFF4889DF488D742428E8860500008B5424284889DE31C0BFDF2040004883C301E87FFAFFFF4881FB6941ED3F75D3BF58204000E83CFAFFFF660FEFD2660F28C2F20F111424E8CA010000F20F101424BF80204000B802000000660F28C8660F28C2E83EFAFFFFF20F101424F20F5815B10B0000F20F103DB10B0000660F2FFA73BBBFEE204000E8E9F9FFFF660FEFD2660F28C2F20F111424E857010000F20F101424BFA0204000B802000000660F28C8660F28C2E8EBF9FFFFF20F101424F20F58156E0B0000F20F103D6E0B0000660F2FFA73BB488B442448644833042528000000750F4883C45031C05B5D415C415D415EC3E89CF9FFFF662E0F1F8400000000006690F30F1EFA31ED4989D15E4889E24883E4F0505449C7C0201C400048C7C1B01B400048C7C7B0104000FF15F2280000F490F30F1EFAC3662E0F1F84000000000090B868404000483D684040007413B8000000004885C07409BF68404000FFE06690C30F1F440000662E0F1F840000000000BE684040004881EE684040004889F048C1EE3F48C1F8034801C648D1FE7411B8000000004885C07407BF68404000FFE0C30F1F440000662E0F1F840000000000803DE1280000007517554889E5E87EFFFFFFC605CF280000015DC30F1F440000C30F1F440000662E0F1F840000000000EB8E662E0F1F8400000000000F1F4000F20F5905480A0000F20F5E05480A0000C366662E0F1F8400000000000F1F4000F20F5905300A0000F20F5E05200A0000C3662E0F1F8400000000000F1F440000F20F5EC8534889F34883EC50F20F5ED0F20F110C24DD0424660FEFC9DB3C24DB2C24F20F5ED8F20F11542418DD442418D9C1D8CAD905E6090000D8CADEE9D905E0090000DCF9F20F115C2418D9C3D8C4D8CCD8CCD9CCDEC9DECAD9CADEE1D905C4090000DC4C2418DEC1D835BC090000D9C1D8CAD8CAD9C1D8CAD
8E1DD5C2418F20F10442418660F2FC80F8398000000DDD8660F2EC8660F28D0C70701000000F20F51D20F87B6010000D9C9DB7C2430F20F100D90090000DD542418F20F10442418660F540586090000DB7C2420F20F58C2E879F7FFFFF20F11442418DD442418DB6C2430D8F1DEC1DD5C2418DB6C2420D9EEDFF1DDD87714F20F107C2418660F573D59090000F20F117C2418DB2C24D8350A090000DC6C2418DD1B4883C4505BC3660F1F440000DD5C2418F20F10442418C70703000000660F28F0660F2EC8F20F51F6F20F117424180F8736010000D9C9DB7C2420DC742418DD5C2418F20F10442418E827F7FFFFDB6C2420660FEFC9F20F11442418DD5C2420F20F10542420660F2ECA660F28DAF20F51DB0F870D010000F20F102DD5070000F20F591D8D080000F20F5EC5F20F116C2430F20F115C2420E8C8F6FFFFDB2C24F20F11442440F20F10442418D83553080000F20F580563080000F20F5E442430DB3C24E89DF6FFFFF20F104C2440F20F10642420DB2C24660F28D0F20F59CCF20F59D4D9C0F20F114C2440DC6C2440DD5C2440F20F10442440F20F11542440DC6C2440DD5C2440660F164424400F1103F20F10442418F20F580507080000F20F5E442430E83CF6FFFFF20F59442420DB2C24F20F11442418DC6C2418DD5B104883C4505BC3DB7C2430F20F11542418DB7C2420E82DF6FFFFF20F10542418DB6C2430DB6C2420E926FEFFFFDB7C2430DB7C2420E80DF6FFFFDB6C2430DB6C2420E9B2FEFFFF660F28C2F20F11542448F20F115C2420E8EBF5FFFFF20F103DB3060000F20F10442418F20F105C2420F20F591D5F070000F20F5EC7F20F117C2430F20F115C2420E89AF5FFFFDB2C24F20F10742420F20F10542448D83525070000F20F59F0660F28C2F20F11742440D9C0DB3C24DC6C2440DD1BE887F5FFFFF20F10442418F20F580511070000F20F5E442430E84EF5FFFFDB2C24F20F10542448F20F59442420F20F11442440DC6C2440660F28C2DD5B08E849F5FFFFE9CFFEFFFF0F1F400041B82000000031C031D2660F1F4400004889F948C1E70248C1E91E83E103488D1491488D0C85010000004801C04839CA72074829CA4883C0014183E80175D1488906C3662E0F1F8400000000000F1F00F30F1EFA41574C8D3D4322000041564989D641554989F541544189FC55488D2D34220000534C29FD4883EC08E81FF4FFFF48C1FD03741F31DB0F1F80000000004C89F24C89EE4489E741FF14DF4883C3014839DD75EA4883C4085B5D415C415D415E415FC366662E0F1F840000000000F30F1EFAC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x401C28
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x402000
+ AddressAlign: 0x10
+ Offset: 0x2000
+ Content: 01000200000000002A2A2A2A2A2A2A2A2A2043554249432046554E4354494F4E53202A2A2A2A2A2A2A2A2A2A2A0000002A2A2A2A2A2A2A2A2A20494E54454745522053515220524F4F5453202A2A2A2A2A2A2A2A2A2A2A002A2A2A2A2A2A2A2A2A20414E474C4520434F4E56455253494F4E202A2A2A2A2A2A2A2A2A2A2A000025332E30662064656772656573203D20252E3132662072616469616E730A0000252E3132662072616469616E73203D2025332E306620646567726565730A00536F6C7574696F6E733A0020256600737172742825336429203D202532640A007371727428256C5829203D2025580A0000000000000000F0BF00000000000014400000000000002440000000000000F03F0000000000003EC0000000000000404000000000000025C0000000000000314000000000000012C00000000000003FC000000000000036400000000000000CC000000000008041C06666666666662BC00000000000002840AE47E17A14AE284000000000000008409A999999999937C00000000000001840295C8FC2F5F850C000000000000020C000000000000041400000000000001E40D7A3703D0A572140000000000080464000000000000030403333333333331540333333333333FBBF00000000000028C077BE9F1A2FDDDC3F85EB51B81E85E33F000000000000D03FFCA9F1D24D62503F0000000000807640399D52A246DF413F9B0B6097FB2119400000000000806640182D4454FB21094000004040000010410000D8410000584200000000000000C0182D4454FB211940182D4454FB212940555555555555D53FFFFFFFFFFFFFFF7F000000000000000000000000000000800000000000000000
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x402270
+ AddressAlign: 0x4
+ Content: 011B033B5C0000000A000000B0EDFFFFA000000040EEFFFFC800000060F4FFFF7800000090F4FFFF8C00000050F5FFFF1001000070F5FFFF2401000090F5FFFF38010000F0F8FFFF6801000040F9FFFF80010000B0F9FFFFC8010000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x4022D0
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000100000001C000000E0F3FFFF2F000000004407101000000030000000FCF3FFFF0500000000000000240000004400000008EDFFFF90000000000E10460E184A0F0B770880003F1A3B2A33242200000000440000006C00000070EDFFFF1406000000420E108E02470E188D03420E208C04410E288605410E308306440E800103F3050A0E30430E28410E20420E18420E10420E08410B00000010000000B400000038F4FFFF110000000000000010000000C800000044F4FFFF11000000000000002C000000DC00000050F4FFFF5C03000000450E108302470E600314010A0E10410E08470B0336010A0E10410E08410B00140000000C01000080F7FFFF4300000000000000000000004400000024010000B8F7FFFF6500000000460E108F02490E188E03450E208D04450E288C05440E308606480E388307470E406E0E38410E30410E28420E20420E18420E10420E0800100000006C010000E0F7FFFF050000000000000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x403E00
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E00
+ Content: B017400000000000
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x403E08
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '8017400000000000'
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x403E10
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x1
+ - Tag: DT_NEEDED
+ Value: 0x28
+ - Tag: DT_INIT
+ Value: 0x401000
+ - Tag: DT_FINI
+ Value: 0x401C28
+ - Tag: DT_INIT_ARRAY
+ Value: 0x403E00
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x403E08
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x400308
+ - Tag: DT_STRTAB
+ Value: 0x400430
+ - Tag: DT_SYMTAB
+ Value: 0x400328
+ - Tag: DT_STRSZ
+ Value: 0x8A
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_DEBUG
+ Value: 0x0
+ - Tag: DT_PLTGOT
+ Value: 0x404000
+ - Tag: DT_PLTRELSZ
+ Value: 0xC0
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x400560
+ - Tag: DT_RELA
+ Value: 0x400530
+ - Tag: DT_RELASZ
+ Value: 0x30
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x4004D0
+ - Tag: DT_VERNEEDNUM
+ Value: 0x2
+ - Tag: DT_VERSYM
+ Value: 0x4004BA
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x403FF0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '00000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x404000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: 103E400000000000000000000000000000000000000000003610400000000000461040000000000056104000000000006610400000000000761040000000000086104000000000009610400000000000A610400000000000
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x404058
+ AddressAlign: 0x8
+ Content: '00000000000000000000000000000000'
+ - Name: .tm_clone_table
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x404068
+ AddressAlign: 0x8
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x404068
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E747520392E342E302D317562756E7475317E31362E30342920392E342E3000
+ - Name: .rela.init
+ Type: SHT_RELA
+ Flags: [ SHF_INFO_LINK ]
+ Link: .symtab
+ AddressAlign: 0x8
+ Info: .init
+ Relocations:
+ - Offset: 0x40100B
+ Symbol: __gmon_start__
+ Type: R_X86_64_REX_GOTPCRELX
+ Addend: -4
+ - Name: .rela.text
+ Type: SHT_RELA
+ Flags: [ SHF_INFO_LINK ]
+ Link: .symtab
+ AddressAlign: 0x8
+ Info: .text
+ Relocations:
+ - Offset: 0x4010B3
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 8
+ - Offset: 0x4010D2
+ Symbol: 'puts@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4010E3
+ Symbol: .LC6
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4010EB
+ Symbol: .LC7
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4010F3
+ Symbol: .LC8
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4010FF
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401109
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40110E
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x401115
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40112C
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x40113A
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40114A
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40115B
+ Symbol: .LC6
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401163
+ Symbol: .LC11
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40116B
+ Symbol: .LC12
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401177
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401181
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401186
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x40118D
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4011A4
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x4011B2
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4011C2
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4011C9
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4011D1
+ Symbol: .LC13
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4011DE
+ Symbol: .LC14
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4011E6
+ Symbol: .LC15
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4011F5
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4011FA
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x401201
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401218
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401226
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401236
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401247
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40124F
+ Symbol: .LC16
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401257
+ Symbol: .LC17
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401266
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40126B
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x401272
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401289
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401297
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4012A7
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4012AE
+ Symbol: .LC2
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4012B6
+ Symbol: .LC18
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4012C3
+ Symbol: .LC19
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4012CB
+ Symbol: .LC20
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4012DA
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4012DF
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x4012E6
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4012FB
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401309
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401319
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401321
+ Symbol: .LC21
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401329
+ Symbol: .LC22
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401336
+ Symbol: .LC23
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40133E
+ Symbol: .LC24
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401348
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40134D
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x401354
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401369
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401377
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401387
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40138F
+ Symbol: .LC25
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401397
+ Symbol: .LC26
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4013A4
+ Symbol: .LC27
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4013AC
+ Symbol: .LC28
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4013B6
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4013BB
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x4013C2
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4013D7
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x4013E5
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4013F5
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4013FD
+ Symbol: .LC29
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401405
+ Symbol: .LC30
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401412
+ Symbol: .LC31
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40141A
+ Symbol: .LC32
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401424
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401429
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x401430
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401445
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401453
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401469
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401470
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40147C
+ Symbol: .LC3
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40148E
+ Symbol: .LC2
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4014A3
+ Symbol: .LC0
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4014D2
+ Symbol: SolveCubic
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4014D7
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 191
+ - Offset: 0x4014DE
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4014F3
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 202
+ - Offset: 0x401501
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401511
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40151E
+ Symbol: .LC33
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401530
+ Symbol: .LC34
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401554
+ Symbol: .LC35
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40156C
+ Symbol: .LC4
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401587
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 48
+ - Offset: 0x40158E
+ Symbol: 'puts@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40159B
+ Symbol: usqrt
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4015A6
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 206
+ - Offset: 0x4015B1
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4015C9
+ Symbol: 'putchar@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4015D6
+ Symbol: usqrt
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4015E4
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 223
+ - Offset: 0x4015ED
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4015FB
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 88
+ - Offset: 0x401600
+ Symbol: 'puts@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401612
+ Symbol: deg2rad
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40161C
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 128
+ - Offset: 0x40162E
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40163B
+ Symbol: .LC41
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401643
+ Symbol: .LC42
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40164E
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 238
+ - Offset: 0x401653
+ Symbol: 'puts@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401665
+ Symbol: rad2deg
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40166F
+ Symbol: .rodata
+ Type: R_X86_64_32
+ Addend: 160
+ - Offset: 0x401681
+ Symbol: 'printf@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x40168E
+ Symbol: .LC45
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401696
+ Symbol: .LC46
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4016C0
+ Symbol: '__stack_chk_fail@@GLIBC_2.4'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4016E6
+ Symbol: __libc_csu_fini
+ Type: R_X86_64_32S
+ - Offset: 0x4016ED
+ Symbol: __libc_csu_init
+ Type: R_X86_64_32S
+ - Offset: 0x4016F4
+ Symbol: main
+ Type: R_X86_64_32S
+ - Offset: 0x4016FA
+ Symbol: '__libc_start_main@@GLIBC_2.2.5'
+ Type: R_X86_64_GOTPCRELX
+ Addend: -4
+ - Offset: 0x401711
+ Symbol: __TMC_END__
+ Type: R_X86_64_32
+ - Offset: 0x401717
+ Symbol: .tm_clone_table
+ Type: R_X86_64_32S
+ - Offset: 0x40171E
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_32
+ - Offset: 0x401728
+ Symbol: .tm_clone_table
+ Type: R_X86_64_32
+ - Offset: 0x401741
+ Symbol: __TMC_END__
+ Type: R_X86_64_32
+ - Offset: 0x401748
+ Symbol: .tm_clone_table
+ Type: R_X86_64_32S
+ - Offset: 0x401760
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_32
+ - Offset: 0x40176A
+ Symbol: .tm_clone_table
+ Type: R_X86_64_32
+ - Offset: 0x401782
+ Symbol: .bss
+ Type: R_X86_64_PC32
+ Addend: -5
+ - Offset: 0x401794
+ Symbol: .bss
+ Type: R_X86_64_PC32
+ Addend: -5
+ - Offset: 0x4017C4
+ Symbol: '.LC0 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4017CC
+ Symbol: .LC1
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4017E4
+ Symbol: .LC1
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4017EC
+ Symbol: '.LC0 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401836
+ Symbol: '.LC0 (2)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401840
+ Symbol: '.LC1 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401860
+ Symbol: '.LC2 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40186C
+ Symbol: '.LC3 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4018B4
+ Symbol: .LC9
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4018C6
+ Symbol: .LC10
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4018D3
+ Symbol: 'pow@@GLIBC_2.29'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401903
+ Symbol: '.LC12 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401912
+ Symbol: '.LC0 (2)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401965
+ Symbol: 'acos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401997
+ Symbol: '.LC6 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x40199F
+ Symbol: .LC5
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4019B4
+ Symbol: 'cos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x4019C9
+ Symbol: '.LC0 (2)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4019D1
+ Symbol: '.LC7 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x4019DF
+ Symbol: 'cos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401A35
+ Symbol: '.LC8 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401A40
+ Symbol: 'cos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401A6F
+ Symbol: 'sqrt@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401A8F
+ Symbol: 'sqrt@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401AB1
+ Symbol: 'sqrt@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401AB9
+ Symbol: '.LC6 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401ACD
+ Symbol: .LC5
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401AE2
+ Symbol: 'cos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401AF7
+ Symbol: '.LC0 (2)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401B15
+ Symbol: 'sqrt@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401B23
+ Symbol: '.LC7 (1)'
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401B2E
+ Symbol: 'cos@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401B53
+ Symbol: 'sqrt@@GLIBC_2.2.5'
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Offset: 0x401BB9
+ Symbol: __init_array_start
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401BD0
+ Symbol: __init_array_end
+ Type: R_X86_64_PC32
+ Addend: -4
+ - Offset: 0x401BDD
+ Symbol: _init
+ Type: R_X86_64_PLT32
+ Addend: -4
+ - Name: .rela.eh_frame
+ Type: SHT_RELA
+ Flags: [ SHF_INFO_LINK ]
+ Link: .symtab
+ AddressAlign: 0x8
+ Info: .eh_frame
+ Relocations:
+ - Offset: 0x4022F0
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 1568
+ - Offset: 0x402304
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 1616
+ - Offset: 0x402340
+ Symbol: .text
+ Type: R_X86_64_PC32
+ - Offset: 0x402388
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 1808
+ - Offset: 0x40239C
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 1840
+ - Offset: 0x4023B0
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 1872
+ - Offset: 0x4023E0
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 2736
+ - Offset: 0x4023F8
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 2816
+ - Offset: 0x402440
+ Symbol: .text
+ Type: R_X86_64_PC32
+ Addend: 2928
+ - Name: .rela.init_array
+ Type: SHT_RELA
+ Flags: [ SHF_INFO_LINK ]
+ Link: .symtab
+ AddressAlign: 0x8
+ Info: .init_array
+ Relocations:
+ - Offset: 0x403E00
+ Symbol: .text
+ Type: R_X86_64_64
+ Addend: 1792
+ - Name: .rela.fini_array
+ Type: SHT_RELA
+ Flags: [ SHF_INFO_LINK ]
+ Link: .symtab
+ AddressAlign: 0x8
+ Info: .fini_array
+ Relocations:
+ - Offset: 0x403E08
+ Symbol: .text
+ Type: R_X86_64_64
+ Addend: 1744
+ - Type: SectionHeaderTable
+ Sections:
+ - Name: .interp
+ - Name: .note.gnu.build-id
+ - Name: .note.ABI-tag
+ - Name: .gnu.hash
+ - Name: .dynsym
+ - Name: .dynstr
+ - Name: .gnu.version
+ - Name: .gnu.version_r
+ - Name: .rela.dyn
+ - Name: .rela.plt
+ - Name: .init
+ - Name: .rela.init
+ - Name: .plt
+ - Name: .text
+ - Name: .rela.text
+ - Name: .fini
+ - Name: .rodata
+ - Name: .eh_frame_hdr
+ - Name: .eh_frame
+ - Name: .rela.eh_frame
+ - Name: .init_array
+ - Name: .rela.init_array
+ - Name: .fini_array
+ - Name: .rela.fini_array
+ - Name: .dynamic
+ - Name: .got
+ - Name: .got.plt
+ - Name: .data
+ - Name: .tm_clone_table
+ - Name: .bss
+ - Name: .comment
+ - Name: .symtab
+ - Name: .strtab
+ - Name: .shstrtab
+Symbols:
+ - Name: .interp
+ Type: STT_SECTION
+ Section: .interp
+ Value: 0x4002A8
+ - Name: .note.gnu.build-id
+ Type: STT_SECTION
+ Section: .note.gnu.build-id
+ Value: 0x4002C4
+ - Name: .note.ABI-tag
+ Type: STT_SECTION
+ Section: .note.ABI-tag
+ Value: 0x4002E8
+ - Name: .gnu.hash
+ Type: STT_SECTION
+ Section: .gnu.hash
+ Value: 0x400308
+ - Name: .dynsym
+ Type: STT_SECTION
+ Section: .dynsym
+ Value: 0x400328
+ - Name: .dynstr
+ Type: STT_SECTION
+ Section: .dynstr
+ Value: 0x400430
+ - Name: .gnu.version
+ Type: STT_SECTION
+ Section: .gnu.version
+ Value: 0x4004BA
+ - Name: .gnu.version_r
+ Type: STT_SECTION
+ Section: .gnu.version_r
+ Value: 0x4004D0
+ - Name: .rela.dyn
+ Type: STT_SECTION
+ Section: .rela.dyn
+ Value: 0x400530
+ - Name: .rela.plt
+ Type: STT_SECTION
+ Section: .rela.plt
+ Value: 0x400560
+ - Name: .init
+ Type: STT_SECTION
+ Section: .init
+ Value: 0x401000
+ - Name: .plt
+ Type: STT_SECTION
+ Section: .plt
+ Value: 0x401020
+ - Name: .text
+ Type: STT_SECTION
+ Section: .text
+ Value: 0x4010B0
+ - Name: .fini
+ Type: STT_SECTION
+ Section: .fini
+ Value: 0x401C28
+ - Name: .rodata
+ Type: STT_SECTION
+ Section: .rodata
+ Value: 0x402000
+ - Name: .eh_frame_hdr
+ Type: STT_SECTION
+ Section: .eh_frame_hdr
+ Value: 0x402270
+ - Name: .eh_frame
+ Type: STT_SECTION
+ Section: .eh_frame
+ Value: 0x4022D0
+ - Name: .init_array
+ Type: STT_SECTION
+ Section: .init_array
+ Value: 0x403E00
+ - Name: .fini_array
+ Type: STT_SECTION
+ Section: .fini_array
+ Value: 0x403E08
+ - Name: .dynamic
+ Type: STT_SECTION
+ Section: .dynamic
+ Value: 0x403E10
+ - Name: .got
+ Type: STT_SECTION
+ Section: .got
+ Value: 0x403FF0
+ - Name: .got.plt
+ Type: STT_SECTION
+ Section: .got.plt
+ Value: 0x404000
+ - Name: .data
+ Type: STT_SECTION
+ Section: .data
+ Value: 0x404058
+ - Name: .tm_clone_table
+ Type: STT_SECTION
+ Section: .tm_clone_table
+ Value: 0x404068
+ - Name: .bss
+ Type: STT_SECTION
+ Section: .bss
+ Value: 0x404068
+ - Name: .comment
+ Type: STT_SECTION
+ Section: .comment
+ - Name: basicmath_large.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: .LC6
+ Section: .rodata
+ Value: 0x402110
+ - Name: .LC7
+ Section: .rodata
+ Value: 0x402118
+ - Name: .LC8
+ Section: .rodata
+ Value: 0x402120
+ - Name: .LC4
+ Section: .rodata
+ Value: 0x402108
+ - Name: .LC11
+ Section: .rodata
+ Value: 0x402128
+ - Name: .LC12
+ Section: .rodata
+ Value: 0x402130
+ - Name: .LC13
+ Section: .rodata
+ Value: 0x402138
+ - Name: .LC14
+ Section: .rodata
+ Value: 0x402140
+ - Name: .LC15
+ Section: .rodata
+ Value: 0x402148
+ - Name: .LC16
+ Section: .rodata
+ Value: 0x402150
+ - Name: .LC17
+ Section: .rodata
+ Value: 0x402158
+ - Name: .LC2
+ Section: .rodata
+ Value: 0x4020F8
+ - Name: .LC18
+ Section: .rodata
+ Value: 0x402160
+ - Name: .LC19
+ Section: .rodata
+ Value: 0x402168
+ - Name: .LC20
+ Section: .rodata
+ Value: 0x402170
+ - Name: .LC21
+ Section: .rodata
+ Value: 0x402178
+ - Name: .LC22
+ Section: .rodata
+ Value: 0x402180
+ - Name: .LC23
+ Section: .rodata
+ Value: 0x402188
+ - Name: .LC24
+ Section: .rodata
+ Value: 0x402190
+ - Name: .LC25
+ Section: .rodata
+ Value: 0x402198
+ - Name: .LC26
+ Section: .rodata
+ Value: 0x4021A0
+ - Name: .LC27
+ Section: .rodata
+ Value: 0x4021A8
+ - Name: .LC28
+ Section: .rodata
+ Value: 0x4021B0
+ - Name: .LC29
+ Section: .rodata
+ Value: 0x4021B8
+ - Name: .LC30
+ Section: .rodata
+ Value: 0x4021C0
+ - Name: .LC31
+ Section: .rodata
+ Value: 0x4021C8
+ - Name: .LC32
+ Section: .rodata
+ Value: 0x4021D0
+ - Name: .LC3
+ Section: .rodata
+ Value: 0x402100
+ - Name: .LC0
+ Section: .rodata
+ Value: 0x4020F0
+ - Name: .LC33
+ Section: .rodata
+ Value: 0x4021D8
+ - Name: .LC34
+ Section: .rodata
+ Value: 0x4021E0
+ - Name: .LC35
+ Section: .rodata
+ Value: 0x4021E8
+ - Name: .LC41
+ Section: .rodata
+ Value: 0x4021F0
+ - Name: .LC42
+ Section: .rodata
+ Value: 0x4021F8
+ - Name: .LC45
+ Section: .rodata
+ Value: 0x402200
+ - Name: .LC46
+ Section: .rodata
+ Value: 0x402208
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __TMC_LIST__
+ Type: STT_OBJECT
+ Section: .tm_clone_table
+ Value: 0x404068
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x401710
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x401740
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x401780
+ - Name: completed.8023
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x404068
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x403E08
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x4017B0
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x403E00
+ - Name: rad2deg.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: '.LC0 (1)'
+ Section: .rodata
+ Value: 0x402210
+ - Name: .LC1
+ Section: .rodata
+ Value: 0x402218
+ - Name: cubic.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: '.LC0 (2)'
+ Section: .rodata
+ Value: 0x402220
+ - Name: '.LC1 (1)'
+ Section: .rodata
+ Value: 0x402224
+ - Name: '.LC2 (1)'
+ Section: .rodata
+ Value: 0x402228
+ - Name: '.LC3 (1)'
+ Section: .rodata
+ Value: 0x40222C
+ - Name: .LC9
+ Section: .rodata
+ Value: 0x402248
+ - Name: .LC10
+ Section: .rodata
+ Value: 0x402250
+ - Name: '.LC12 (1)'
+ Section: .rodata
+ Value: 0x402260
+ - Name: '.LC6 (1)'
+ Section: .rodata
+ Value: 0x402170
+ - Name: .LC5
+ Section: .rodata
+ Value: 0x402230
+ - Name: '.LC7 (1)'
+ Section: .rodata
+ Value: 0x402238
+ - Name: '.LC8 (1)'
+ Section: .rodata
+ Value: 0x402240
+ - Name: isqrt.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x40244C
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __init_array_end
+ Section: .init_array
+ Value: 0x403E08
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x403E10
+ - Name: __init_array_start
+ Section: .init_array
+ Value: 0x403E00
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x402270
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x404000
+ - Name: __libc_csu_fini
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x401C20
+ Size: 0x5
+ - Name: 'putchar@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: data_start
+ Section: .data
+ Binding: STB_WEAK
+ Value: 0x404058
+ - Name: 'puts@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: usqrt
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x401B60
+ Size: 0x43
+ - Name: _edata
+ Section: .tm_clone_table
+ Binding: STB_GLOBAL
+ Value: 0x404068
+ - Name: 'pow@@GLIBC_2.29'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Binding: STB_GLOBAL
+ Value: 0x401C28
+ Other: [ STV_HIDDEN ]
+ - Name: '__stack_chk_fail@@GLIBC_2.4'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: 'printf@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: 'cos@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: 'acos@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: '__libc_start_main@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: deg2rad
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x4017E0
+ Size: 0x11
+ - Name: __data_start
+ Section: .data
+ Binding: STB_GLOBAL
+ Value: 0x404058
+ - Name: SolveCubic
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x401800
+ Size: 0x35C
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Binding: STB_GLOBAL
+ Value: 0x404060
+ Other: [ STV_HIDDEN ]
+ - Name: _IO_stdin_used
+ Type: STT_OBJECT
+ Section: .rodata
+ Binding: STB_GLOBAL
+ Value: 0x402000
+ Size: 0x4
+ - Name: __libc_csu_init
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x401BB0
+ Size: 0x65
+ - Name: _end
+ Section: .bss
+ Binding: STB_GLOBAL
+ Value: 0x404070
+ - Name: _dl_relocate_static_pie
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x401700
+ Size: 0x5
+ Other: [ STV_HIDDEN ]
+ - Name: _start
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x4016D0
+ Size: 0x2F
+ - Name: rad2deg
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x4017C0
+ Size: 0x11
+ - Name: __bss_start
+ Section: .bss
+ Binding: STB_GLOBAL
+ Value: 0x404068
+ - Name: main
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x4010B0
+ Size: 0x614
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .tm_clone_table
+ Binding: STB_GLOBAL
+ Value: 0x404068
+ Other: [ STV_HIDDEN ]
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'sqrt@@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Binding: STB_GLOBAL
+ Value: 0x401000
+ Other: [ STV_HIDDEN ]
+DynamicSymbols:
+ - Name: putchar
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: pow
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __stack_chk_fail
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: printf
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: cos
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: acos
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __libc_start_main
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sqrt
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+...
diff --git a/bolt/test/X86/Inputs/blarge_new_bat.preagg.txt b/bolt/test/X86/Inputs/blarge_new_bat.preagg.txt
new file mode 100644
index 000000000000..e9e4553aad95
--- /dev/null
+++ b/bolt/test/X86/Inputs/blarge_new_bat.preagg.txt
@@ -0,0 +1,79 @@
+B 40169e 40165b 7 0
+B 401664 800012 7 0
+B 401680 401070 7 0
+B 800022 401669 7 0
+B 401611 800000 121 0
+B 40162d 401070 119 0
+B 4015d5 800040 2 0
+B 800080 4015da 6 0
+B 40164b 401608 115 0
+B 800080 40159f 24 0
+B 4015ec 401070 6 0
+B 8001d0 401090 1 0
+B 4014d1 800082 25 0
+B 401510 401030 616 0
+B 8002ab 401080 1 0
+B 80007b 80004c 483 1
+B 800072 80004c 597 77
+B 80010c 800194 1 0
+B 401509 4014ec 1 0
+B 800010 401616 119 0
+B 80024a 401080 1 0
+B 800154 401050 20 0
+B 4014dd 401070 9 0
+B 80021f 401080 1 0
+B 800193 4014d6 8 0
+B 40159a 800040 19 0
+B 4015f8 4015cd 2 0
+B 40152a 4014b0 24 0
+B 401500 401070 15 0
+B 4015bc 401592 21 0
+B 401544 4014a0 1 0
+B 80004a 800052 24 0
+B 4015b0 401070 20 0
+B 800050 80007d 29 29
+F 401685 40169e 7
+F 4014a0 4014d1 1
+F 401090 401090 1
+F 401050 401050 20
+F 40159f 4015b0 20
+F 80007d 800080 27
+F 401515 401544 1
+F 4014e2 401500 13
+F 401592 40159a 19
+F 401505 401509 1
+F 4014b0 4014d1 24
+F 800194 8001d0 1
+F 8001d5 80021f 1
+F 401616 40162d 114
+F 80024f 8002ab 1
+F 800159 800193 7
+F 80004c 800050 26
+F 800224 80024a 1
+F 800082 80010c 1
+F 401080 401080 3
+F 401070 401070 168
+F 80004c 800072 555
+F 401616 800010 2
+F 401070 40162d 4
+F 800082 800154 20
+F 401669 401680 7
+F 40159f 800080 1
+F 4014ec 401500 1
+F 800012 800022 7
+F 401030 401030 616
+F 80004c 80007b 473
+F 800052 800072 24
+F 800040 80004a 21
+F 4015b5 4015bc 18
+F 4015cd 4015d5 2
+F 401592 4015bc 1
+F 4015da 4015ec 6
+F 4015f1 4015f8 2
+F 800000 800010 116
+F 401608 401611 115
+F 401632 40164b 114
+F 401515 40152a 24
+F 40165b 401664 7
+F 401505 401510 612
+F 4014d6 4014dd 8
diff --git a/bolt/test/X86/bolt-address-translation-yaml.test b/bolt/test/X86/bolt-address-translation-yaml.test
new file mode 100644
index 000000000000..25ff4e7fbfcc
--- /dev/null
+++ b/bolt/test/X86/bolt-address-translation-yaml.test
@@ -0,0 +1,40 @@
+# Check new BAT format containing hashes for YAML profile.
+
+RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe
+RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \
+RUN: --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \
+RUN: --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main \
+RUN: 2>&1 | FileCheck --check-prefix WRITE-BAT-CHECK %s
+RUN: perf2bolt %t.out --pa -p %p/Inputs/blarge_new_bat.preagg.txt -w %t.yaml -o %t.fdata \
+RUN: 2>&1 | FileCheck --check-prefix READ-BAT-CHECK %s
+RUN: FileCheck --input-file %t.yaml --check-prefix YAML-BAT-CHECK %s
+# Check that YAML converted from fdata matches YAML created directly with BAT.
+RUN: llvm-bolt %t.exe -data %t.fdata -w %t.yaml-fdata -o /dev/null
+RUN: FileCheck --input-file %t.yaml-fdata --check-prefix YAML-BAT-CHECK %s
+
+# Test resulting YAML profile with the original binary (no-stale mode)
+RUN: llvm-bolt %t.exe -data %t.yaml -o %t.null -dyno-stats \
+RUN: | FileCheck --check-prefix CHECK-BOLT-YAML %s
+
+WRITE-BAT-CHECK: BOLT-INFO: Wrote 5 BAT maps
+WRITE-BAT-CHECK: BOLT-INFO: Wrote 4 function and 22 basic block hashes
+WRITE-BAT-CHECK: BOLT-INFO: BAT section size (bytes): 344
+
+READ-BAT-CHECK-NOT: BOLT-ERROR: unable to save profile in YAML format for input file processed by BOLT
+READ-BAT-CHECK: BOLT-INFO: Parsed 5 BAT entries
+READ-BAT-CHECK: PERF2BOLT: read 79 aggregated LBR entries
+
+YAML-BAT-CHECK: functions:
+YAML-BAT-CHECK: - name: main
+YAML-BAT-CHECK-NEXT: fid: 2
+YAML-BAT-CHECK-NEXT: hash: 0x9895746D48B2C876
+YAML-BAT-CHECK-NEXT: exec: 0
+YAML-BAT-CHECK-NEXT: nblocks: 46
+YAML-BAT-CHECK-NEXT: blocks:
+YAML-BAT-CHECK-NEXT: - bid: 0
+YAML-BAT-CHECK-NEXT: insns: 26
+YAML-BAT-CHECK-NEXT: hash: 0xA900AE79CFD40000
+YAML-BAT-CHECK-NEXT: succ: [ { bid: 3, cnt: 0 }, { bid: 1, cnt: 0 } ]
+
+CHECK-BOLT-YAML: pre-processing profile using YAML profile reader
+CHECK-BOLT-YAML-NEXT: 1 out of 16 functions in the binary (6.2%) have non-empty execution profile
diff --git a/bolt/test/X86/bolt-address-translation.test b/bolt/test/X86/bolt-address-translation.test
index f2020af2edeb..4277b4e0d0fe 100644
--- a/bolt/test/X86/bolt-address-translation.test
+++ b/bolt/test/X86/bolt-address-translation.test
@@ -36,7 +36,8 @@
#
# CHECK: BOLT: 3 out of 7 functions were overwritten.
# CHECK: BOLT-INFO: Wrote 6 BAT maps
-# CHECK: BOLT-INFO: BAT section size (bytes): 336
+# CHECK: BOLT-INFO: Wrote 3 function and 58 basic block hashes
+# CHECK: BOLT-INFO: BAT section size (bytes): 816
#
# usqrt mappings (hot part). We match against any key (left side containing
# the bolted binary offsets) because BOLT may change where it puts instructions
@@ -44,13 +45,13 @@
# binary offsets (right side) should be the same because these addresses are
# hardcoded in the blarge.yaml file.
#
-# CHECK-BAT-DUMP: Function Address: 0x401170
+# CHECK-BAT-DUMP: Function Address: 0x401170, hash: 0xace6cbc638b31983
# CHECK-BAT-DUMP-NEXT: BB mappings:
-# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0
+# CHECK-BAT-DUMP-NEXT: 0x0 -> 0x0 hash: 0x36007ba1d80c0000
# CHECK-BAT-DUMP-NEXT: 0x8 -> 0x8 (branch)
-# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39
+# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x39 hash: 0x5c06705524800039
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x3d (branch)
-# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10
+# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x10 hash: 0xd70d7a64320e0010
# CHECK-BAT-DUMP-NEXT: 0x{{.*}} -> 0x30 (branch)
#
# CHECK-BAT-DUMP: 3 cold mappings
diff --git a/bolt/test/X86/dwarf5-label-low-pc.s b/bolt/test/X86/dwarf5-label-low-pc.s
index b71309716334..890d9e024d1a 100644
--- a/bolt/test/X86/dwarf5-label-low-pc.s
+++ b/bolt/test/X86/dwarf5-label-low-pc.s
@@ -8,6 +8,7 @@
# RUN: llvm-dwarfdump --show-form --verbose --debug-addr %t.bolt > %t.txt
# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt >> %t.txt
+# RUN: cat %t.txt | FileCheck --check-prefix=POSTCHECK %s
# This test checks that we correctly handle DW_AT_low_pc [DW_FORM_addrx] that is part of DW_TAG_label.
@@ -35,16 +36,14 @@
# POSTCHECK-NEXT: DW_AT_name
# POSTCHECK-NEXT: DW_AT_decl_file
# POSTCHECK-NEXT: DW_AT_decl_line
-# POSTCHECK-NEXT:
# POSTCHECK-NEXT:DW_AT_low_pc [DW_FORM_addrx] (indexed (00000002)
-# POSTCHECK-SAME: [0x[[#ADDR]]
+# POSTCHECK-SAME: 0x[[#ADDR]]
# POSTCHECK: DW_TAG_label
# POSTCHECK-NEXT: DW_AT_name
# POSTCHECK-NEXT: DW_AT_decl_file
# POSTCHECK-NEXT: DW_AT_decl_line
-# POSTCHECK-NEXT:
# POSTCHECK-NEXT:DW_AT_low_pc [DW_FORM_addrx] (indexed (00000003)
-# POSTCHECK-SAME: [0x[[#ADDR2]]
+# POSTCHECK-SAME: 0x[[#ADDR2]]
# clang++ main.cpp -g -S
# int main() {
diff --git a/bolt/test/X86/linux-static-keys.s b/bolt/test/X86/linux-static-keys.s
new file mode 100644
index 000000000000..08454bf97631
--- /dev/null
+++ b/bolt/test/X86/linux-static-keys.s
@@ -0,0 +1,67 @@
+# REQUIRES: system-linux
+
+## Check that BOLT correctly updates the Linux kernel static keys jump table.
+
+# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown %s -o %t.o
+# RUN: %clang %cflags -nostdlib %t.o -o %t.exe \
+# RUN: -Wl,--image-base=0xffffffff80000000,--no-dynamic-linker,--no-eh-frame-hdr
+
+## Verify static keys jump bindings to instructions.
+
+# RUN: llvm-bolt %t.exe --print-normalized -o %t.out --keep-nops=0 \
+# RUN: --bolt-info=0 |& FileCheck %s
+
+## Verify the bindings again on the rewritten binary with nops removed.
+
+# RUN: llvm-bolt %t.out -o %t.out.1 --print-normalized |& FileCheck %s
+
+# CHECK: BOLT-INFO: Linux kernel binary detected
+# CHECK: BOLT-INFO: parsed 2 static keys jump entries
+
+ .text
+ .globl _start
+ .type _start, %function
+_start:
+# CHECK: Binary Function "_start"
+ nop
+.L0:
+ jmp .L1
+# CHECK: jit
+# CHECK-SAME: # ID: 1 {{.*}} # Likely: 0 # InitValue: 1
+ nop
+.L1:
+ .nops 5
+# CHECK: jit
+# CHECK-SAME: # ID: 2 {{.*}} # Likely: 1 # InitValue: 1
+.L2:
+ nop
+ .size _start, .-_start
+
+ .globl foo
+ .type foo, %function
+foo:
+ ret
+ .size foo, .-foo
+
+
+## Static keys jump table.
+ .rodata
+ .globl __start___jump_table
+ .type __start___jump_table, %object
+__start___jump_table:
+
+ .long .L0 - . # Jump address
+ .long .L1 - . # Target address
+ .quad 1 # Key address
+
+ .long .L1 - . # Jump address
+ .long .L2 - . # Target address
+ .quad 0 # Key address
+
+ .globl __stop___jump_table
+ .type __stop___jump_table, %object
+__stop___jump_table:
+
+## Fake Linux Kernel sections.
+ .section __ksymtab,"a",@progbits
+ .section __ksymtab_gpl,"a",@progbits
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 47fc2e4886cf..761dab8c28c1 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -190,11 +190,12 @@ set(CLANG_RESOURCE_DIR "" CACHE STRING
set(C_INCLUDE_DIRS "" CACHE STRING
"Colon separated list of directories clang will search for headers.")
+set(USE_DEPRECATED_GCC_INSTALL_PREFIX OFF CACHE BOOL "Temporary workaround before GCC_INSTALL_PREFIX is completely removed")
set(GCC_INSTALL_PREFIX "" CACHE PATH "Directory where gcc is installed." )
set(DEFAULT_SYSROOT "" CACHE STRING
"Default <path> to all compiler invocations for --sysroot=<path>." )
-if(GCC_INSTALL_PREFIX)
- message(WARNING "GCC_INSTALL_PREFIX is deprecated and will be removed. Use "
+if(GCC_INSTALL_PREFIX AND NOT USE_DEPRECATED_GCC_INSTALL_PREFIX)
+ message(FATAL_ERROR "GCC_INSTALL_PREFIX is deprecated and will be removed. Use "
"configuration files (https://clang.llvm.org/docs/UsersManual.html#configuration-files)"
"to specify the default --gcc-install-dir= or --gcc-triple=. --gcc-toolchain= is discouraged. "
"See https://github.com/llvm/llvm-project/pull/77537 for detail.")
diff --git a/clang/docs/ClangFormat.rst b/clang/docs/ClangFormat.rst
index 819d9ee9f9cd..80dc38a075c8 100644
--- a/clang/docs/ClangFormat.rst
+++ b/clang/docs/ClangFormat.rst
@@ -61,6 +61,7 @@ to format C/C++/Java/JavaScript/JSON/Objective-C/Protobuf/C# code.
--dry-run - If set, do not actually make the formatting changes
--dump-config - Dump configuration options to stdout and exit.
Can be used with -style option.
+ --fail-on-incomplete-format - If set, fail with exit code 1 on incomplete format.
--fallback-style=<string> - The name of the predefined style used as a
fallback in case clang-format is invoked with
-style=file, but can not find the .clang-format
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 13d7261d83d7..5711972b55e6 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1459,40 +1459,45 @@ More information could be found `here <https://clang.llvm.org/docs/Modules.html>
Language Extensions Back-ported to Previous Standards
=====================================================
-====================================== ================================ ============= =============
-Feature Feature Test Macro Introduced In Backported To
-====================================== ================================ ============= =============
-variadic templates __cpp_variadic_templates C++11 C++03
-Alias templates __cpp_alias_templates C++11 C++03
-Non-static data member initializers __cpp_nsdmi C++11 C++03
-Range-based ``for`` loop __cpp_range_based_for C++11 C++03
-RValue references __cpp_rvalue_references C++11 C++03
-Attributes __cpp_attributes C++11 C++03
-variable templates __cpp_variable_templates C++14 C++03
-Binary literals __cpp_binary_literals C++14 C++03
-Relaxed constexpr __cpp_constexpr C++14 C++11
-``if constexpr`` __cpp_if_constexpr C++17 C++11
-fold expressions __cpp_fold_expressions C++17 C++03
-Lambda capture of \*this by value __cpp_capture_star_this C++17 C++11
-Attributes on enums __cpp_enumerator_attributes C++17 C++03
-Guaranteed copy elision __cpp_guaranteed_copy_elision C++17 C++03
-Hexadecimal floating literals __cpp_hex_float C++17 C++03
-``inline`` variables __cpp_inline_variables C++17 C++03
-Attributes on namespaces __cpp_namespace_attributes C++17 C++11
-Structured bindings __cpp_structured_bindings C++17 C++03
-template template arguments __cpp_template_template_args C++17 C++03
-``static operator[]`` __cpp_multidimensional_subscript C++20 C++03
-Designated initializers __cpp_designated_initializers C++20 C++03
-Conditional ``explicit`` __cpp_conditional_explicit C++20 C++03
-``using enum`` __cpp_using_enum C++20 C++03
-``if consteval`` __cpp_if_consteval C++23 C++20
-``static operator()`` __cpp_static_call_operator C++23 C++03
-Attributes on Lambda-Expressions C++23 C++11
--------------------------------------- -------------------------------- ------------- -------------
-Designated initializers (N494) C99 C89
-Array & element qualification (N2607) C23 C89
-Attributes (N2335) C23 C89
-====================================== ================================ ============= =============
+============================================ ================================ ============= =============
+Feature Feature Test Macro Introduced In Backported To
+============================================ ================================ ============= =============
+variadic templates __cpp_variadic_templates C++11 C++03
+Alias templates __cpp_alias_templates C++11 C++03
+Non-static data member initializers __cpp_nsdmi C++11 C++03
+Range-based ``for`` loop __cpp_range_based_for C++11 C++03
+RValue references __cpp_rvalue_references C++11 C++03
+Attributes __cpp_attributes C++11 C++03
+Lambdas __cpp_lambdas C++11 C++03
+Generalized lambda captures __cpp_init_captures C++14 C++03
+Generic lambda expressions __cpp_generic_lambdas C++14 C++03
+variable templates __cpp_variable_templates C++14 C++03
+Binary literals __cpp_binary_literals C++14 C++03
+Relaxed constexpr __cpp_constexpr C++14 C++11
+Pack expansion in generalized lambda-capture __cpp_init_captures C++17 C++03
+``if constexpr`` __cpp_if_constexpr C++17 C++11
+fold expressions __cpp_fold_expressions C++17 C++03
+Lambda capture of \*this by value __cpp_capture_star_this C++17 C++03
+Attributes on enums __cpp_enumerator_attributes C++17 C++03
+Guaranteed copy elision __cpp_guaranteed_copy_elision C++17 C++03
+Hexadecimal floating literals __cpp_hex_float C++17 C++03
+``inline`` variables __cpp_inline_variables C++17 C++03
+Attributes on namespaces __cpp_namespace_attributes C++17 C++11
+Structured bindings __cpp_structured_bindings C++17 C++03
+template template arguments __cpp_template_template_args C++17 C++03
+Familiar template syntax for generic lambdas __cpp_generic_lambdas C++20 C++03
+``static operator[]`` __cpp_multidimensional_subscript C++20 C++03
+Designated initializers __cpp_designated_initializers C++20 C++03
+Conditional ``explicit`` __cpp_conditional_explicit C++20 C++03
+``using enum`` __cpp_using_enum C++20 C++03
+``if consteval`` __cpp_if_consteval C++23 C++20
+``static operator()`` __cpp_static_call_operator C++23 C++03
+Attributes on Lambda-Expressions C++23 C++11
+-------------------------------------------- -------------------------------- ------------- -------------
+Designated initializers (N494) C99 C89
+Array & element qualification (N2607) C23 C89
+Attributes (N2335) C23 C89
+============================================ ================================ ============= =============
Type Trait Primitives
=====================
@@ -3548,6 +3553,47 @@ argument can be of any unsigned integer type.
``__builtin_popcount{,l,ll}`` builtins, with support for other integer types,
such as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+``__builtin_clzg`` and ``__builtin_ctzg``
+-----------------------------------------
+
+``__builtin_clzg`` (respectively ``__builtin_ctzg``) returns the number of
+leading (respectively trailing) 0 bits in the first argument. The first argument
+can be of any unsigned integer type.
+
+If the first argument is 0 and an optional second argument of ``int`` type is
+provided, then the second argument is returned. If the first argument is 0, but
+only one argument is provided, then the behavior is undefined.
+
+**Syntax**:
+
+.. code-block:: c++
+
+ int __builtin_clzg(type x[, int fallback])
+ int __builtin_ctzg(type x[, int fallback])
+
+**Examples**:
+
+.. code-block:: c++
+
+ unsigned int x = 1;
+ int x_lz = __builtin_clzg(x);
+ int x_tz = __builtin_ctzg(x);
+
+ unsigned long y = 2;
+ int y_lz = __builtin_clzg(y);
+ int y_tz = __builtin_ctzg(y);
+
+ unsigned _BitInt(128) z = 4;
+ int z_lz = __builtin_clzg(z);
+ int z_tz = __builtin_ctzg(z);
+
+**Description**:
+
+``__builtin_clzg`` (respectively ``__builtin_ctzg``) is meant to be a
+type-generic alternative to the ``__builtin_clz{,l,ll}`` (respectively
+``__builtin_ctz{,l,ll}``) builtins, with support for other integer types, such
+as ``unsigned __int128`` and C23 ``unsigned _BitInt(N)``.
+
Multiprecision Arithmetic Builtins
----------------------------------
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c0b0c8a8a3ea..fd12bb41be47 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -37,6 +37,9 @@ These changes are ones which we think may surprise users when upgrading to
Clang |release| because of the opportunity they pose for disruption to existing
code bases.
+- Setting the deprecated CMake variable ``GCC_INSTALL_PREFIX`` (which sets the
+ default ``--gcc-toolchain=``) now leads to a fatal error.
+
C/C++ Language Potentially Breaking Changes
-------------------------------------------
@@ -177,6 +180,8 @@ Non-comprehensive list of changes in this release
the previous builtins, this new builtin is constexpr and may be used in
constant expressions.
+- Lambda expressions are now accepted in C++03 mode as an extension.
+
New Compiler Flags
------------------
@@ -193,7 +198,25 @@ Modified Compiler Flags
``-Wreturn-type``, and moved some of the diagnostics previously controlled by
``-Wreturn-type`` under this new flag. Fixes #GH72116.
-- Added ``-Wcast-function-type`` as a warning enabled by ``-Wextra``. #GH76872
+- Added ``-Wcast-function-type-mismatch`` under the ``-Wcast-function-type``
+ warning group. Moved the diagnostic previously controlled by
+ ``-Wcast-function-type`` to the new warning group and added
+ ``-Wcast-function-type-mismatch`` to ``-Wextra``. #GH76872
+
+ .. code-block:: c
+
+ int x(long);
+ typedef int (f2)(void*);
+ typedef int (f3)();
+
+ void func(void) {
+ // Diagnoses under -Wcast-function-type, -Wcast-function-type-mismatch,
+ // -Wcast-function-type-strict, -Wextra
+ f2 *b = (f2 *)x;
+ // Diagnoses under -Wcast-function-type, -Wcast-function-type-strict
+ f3 *c = (f3 *)x;
+ }
+
Removed Compiler Flags
-------------------------
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 491c9d895413..21ab9bb86d1b 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -676,7 +676,11 @@ def Clz : Builtin, BitShort_Int_Long_LongLongTemplate {
let Prototype = "int(unsigned T)";
}
-// FIXME: Add int clzimax(uintmax_t)
+def Clzg : Builtin {
+ let Spellings = ["__builtin_clzg"];
+ let Attributes = [NoThrow, Const, CustomTypeChecking];
+ let Prototype = "int(...)";
+}
def Ctz : Builtin, BitShort_Int_Long_LongLongTemplate {
let Spellings = ["__builtin_ctz"];
@@ -684,7 +688,11 @@ def Ctz : Builtin, BitShort_Int_Long_LongLongTemplate {
let Prototype = "int(unsigned T)";
}
-// FIXME: Add int ctzimax(uintmax_t)
+def Ctzg : Builtin {
+ let Spellings = ["__builtin_ctzg"];
+ let Attributes = [NoThrow, Const, CustomTypeChecking];
+ let Prototype = "int(...)";
+}
def FFS : Builtin, BitInt_Long_LongLongTemplate {
let Spellings = ["__builtin_ffs"];
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index bf03d4e8f67e..44035e2fd16f 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -573,7 +573,10 @@ def SelTypeCast : DiagGroup<"cast-of-sel-type">;
def FunctionDefInObjCContainer : DiagGroup<"function-def-in-objc-container">;
def BadFunctionCast : DiagGroup<"bad-function-cast">;
def CastFunctionTypeStrict : DiagGroup<"cast-function-type-strict">;
-def CastFunctionType : DiagGroup<"cast-function-type", [CastFunctionTypeStrict]>;
+def CastFunctionTypeMismatch : DiagGroup<"cast-function-type-mismatch">;
+def CastFunctionType : DiagGroup<"cast-function-type",
+ [CastFunctionTypeStrict,
+ CastFunctionTypeMismatch]>;
def ObjCPropertyImpl : DiagGroup<"objc-property-implementation">;
def ObjCPropertyNoAttribute : DiagGroup<"objc-property-no-attribute">;
def ObjCPropertyAssignOnObjectType : DiagGroup<"objc-property-assign-on-object-type">;
@@ -1038,7 +1041,7 @@ def Extra : DiagGroup<"extra", [
EmptyInitStatement,
StringConcatation,
FUseLdPath,
- CastFunctionType,
+ CastFunctionTypeMismatch,
]>;
def Most : DiagGroup<"most", [
diff --git a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
index f99a5fca64cb..a4c6e630ac5f 100644
--- a/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
+++ b/clang/include/clang/Basic/DiagnosticInstallAPIKinds.td
@@ -26,6 +26,7 @@ def warn_library_hidden_symbol : Warning<"declaration has external linkage, but
def warn_header_hidden_symbol : Warning<"symbol exported in dynamic library, but marked hidden in declaration '%0'">, InGroup<InstallAPIViolation>;
def err_header_hidden_symbol : Error<"symbol exported in dynamic library, but marked hidden in declaration '%0'">;
def err_header_symbol_missing : Error<"no declaration found for exported symbol '%0' in dynamic library">;
+def warn_header_symbol_missing : Warning<"no declaration was found for exported symbol '%0' in dynamic library">, InGroup<InstallAPIViolation>;
def warn_header_availability_mismatch : Warning<"declaration '%0' is marked %select{available|unavailable}1,"
" but symbol is %select{not |}2exported in dynamic library">, InGroup<InstallAPIViolation>;
def err_header_availability_mismatch : Error<"declaration '%0' is marked %select{available|unavailable}1,"
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index 816c3ff5f8b2..48de5e2ef5f4 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1029,6 +1029,7 @@ def err_expected_lambda_body : Error<"expected body of lambda expression">;
def warn_cxx98_compat_lambda : Warning<
"lambda expressions are incompatible with C++98">,
InGroup<CXX98Compat>, DefaultIgnore;
+def ext_lambda : ExtWarn<"lambdas are a C++11 extension">, InGroup<CXX11>;
def err_lambda_decl_specifier_repeated : Error<
"%select{'mutable'|'static'|'constexpr'|'consteval'}0 cannot "
"appear multiple times in a lambda declarator">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 2646942a53e3..fc727cef9cd8 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9058,7 +9058,7 @@ def warn_bad_function_cast : Warning<
InGroup<BadFunctionCast>, DefaultIgnore;
def warn_cast_function_type : Warning<
"cast %diff{from $ to $ |}0,1converts to incompatible function type">,
- InGroup<CastFunctionType>, DefaultIgnore;
+ InGroup<CastFunctionTypeMismatch>, DefaultIgnore;
def warn_cast_function_type_strict : Warning<warn_cast_function_type.Summary>,
InGroup<CastFunctionTypeStrict>, DefaultIgnore;
def err_cast_pointer_to_non_pointer_int : Error<
@@ -12020,13 +12020,14 @@ def err_builtin_launder_invalid_arg : Error<
"'__builtin_launder' is not allowed">;
def err_builtin_invalid_arg_type: Error <
- "%ordinal0 argument must be a "
- "%select{vector, integer or floating point type|matrix|"
- "pointer to a valid matrix element type|"
- "signed integer or floating point type|vector type|"
- "floating point type|"
- "vector of integers|"
- "type of unsigned integer}1 (was %2)">;
+ "%ordinal0 argument must be "
+ "%select{a vector, integer or floating point type|a matrix|"
+ "a pointer to a valid matrix element type|"
+ "a signed integer or floating point type|a vector type|"
+ "a floating point type|"
+ "a vector of integers|"
+ "an unsigned integer|"
+ "an 'int'}1 (was %2)">;
def err_builtin_matrix_disabled: Error<
"matrix types extension is disabled. Pass -fenable-matrix to enable it">;
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 726ead4b5ab5..b41aadc73f20 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -261,6 +261,7 @@ EXTENSION(cxx_defaulted_functions, LangOpts.CPlusPlus)
EXTENSION(cxx_deleted_functions, LangOpts.CPlusPlus)
EXTENSION(cxx_explicit_conversions, LangOpts.CPlusPlus)
EXTENSION(cxx_inline_namespaces, LangOpts.CPlusPlus)
+EXTENSION(cxx_lambdas, LangOpts.CPlusPlus)
EXTENSION(cxx_local_type_template_args, LangOpts.CPlusPlus)
EXTENSION(cxx_nonstatic_member_init, LangOpts.CPlusPlus)
EXTENSION(cxx_override_control, LangOpts.CPlusPlus)
diff --git a/clang/include/clang/InstallAPI/DylibVerifier.h b/clang/include/clang/InstallAPI/DylibVerifier.h
index bbfa8711313e..49de24763f1f 100644
--- a/clang/include/clang/InstallAPI/DylibVerifier.h
+++ b/clang/include/clang/InstallAPI/DylibVerifier.h
@@ -28,7 +28,7 @@ enum class VerificationMode {
/// lifetime of InstallAPI.
/// As declarations are collected during AST traversal, they are
/// compared as symbols against what is available in the binary dylib.
-class DylibVerifier {
+class DylibVerifier : llvm::MachO::RecordVisitor {
private:
struct SymbolContext;
@@ -72,6 +72,9 @@ public:
Result verify(ObjCIVarRecord *R, const FrontendAttrs *FA,
const StringRef SuperClass);
+ // Scan through dylib slices and report any remaining missing exports.
+ Result verifyRemainingSymbols();
+
/// Initialize target for verification.
void setTarget(const Target &T);
@@ -128,6 +131,14 @@ private:
/// Find matching dylib slice for target triple that is being parsed.
void assignSlice(const Target &T);
+ /// Shared implementation for verifying exported symbols in dylib.
+ void visitSymbolInDylib(const Record &R, SymbolContext &SymCtx);
+
+ void visitGlobal(const GlobalRecord &R) override;
+ void visitObjCInterface(const ObjCInterfaceRecord &R) override;
+ void visitObjCCategory(const ObjCCategoryRecord &R) override;
+ void visitObjCIVar(const ObjCIVarRecord &R, const StringRef Super);
+
/// Gather annotations for symbol for error reporting.
std::string getAnnotatedName(const Record *R, SymbolContext &SymCtx,
bool ValidSourceLoc = true);
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index ca75c2a756a4..51d76dc257ee 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -494,6 +494,8 @@ private:
InvalidatedSymbols *IS,
RegionAndSymbolInvalidationTraits *HTraits,
const CallEvent *Call) const;
+
+ SVal wrapSymbolicRegion(SVal Base) const;
};
//===----------------------------------------------------------------------===//
@@ -782,20 +784,6 @@ inline SVal ProgramState::getLValue(const ObjCIvarDecl *D, SVal Base) const {
return getStateManager().StoreMgr->getLValueIvar(D, Base);
}
-inline SVal ProgramState::getLValue(const FieldDecl *D, SVal Base) const {
- return getStateManager().StoreMgr->getLValueField(D, Base);
-}
-
-inline SVal ProgramState::getLValue(const IndirectFieldDecl *D,
- SVal Base) const {
- StoreManager &SM = *getStateManager().StoreMgr;
- for (const auto *I : D->chain()) {
- Base = SM.getLValueField(cast<FieldDecl>(I), Base);
- }
-
- return Base;
-}
-
inline SVal ProgramState::getLValue(QualType ElementType, SVal Idx, SVal Base) const{
if (std::optional<NonLoc> N = Idx.getAs<NonLoc>())
return getStateManager().StoreMgr->getLValueElement(ElementType, *N, Base);
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 77cb269d43c5..e14e89088282 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3128,36 +3128,66 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
case Builtin::BI__builtin_ctzs:
case Builtin::BI__builtin_ctz:
case Builtin::BI__builtin_ctzl:
- case Builtin::BI__builtin_ctzll: {
- Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
+ case Builtin::BI__builtin_ctzll:
+ case Builtin::BI__builtin_ctzg: {
+ bool HasFallback = BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_ctzg &&
+ E->getNumArgs() > 1;
+
+ Value *ArgValue =
+ HasFallback ? EmitScalarExpr(E->getArg(0))
+ : EmitCheckedArgForBuiltin(E->getArg(0), BCK_CTZPassedZero);
llvm::Type *ArgType = ArgValue->getType();
Function *F = CGM.getIntrinsic(Intrinsic::cttz, ArgType);
llvm::Type *ResultType = ConvertType(E->getType());
- Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
+ Value *ZeroUndef =
+ Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef());
Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
if (Result->getType() != ResultType)
Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
"cast");
- return RValue::get(Result);
+ if (!HasFallback)
+ return RValue::get(Result);
+
+ Value *Zero = Constant::getNullValue(ArgType);
+ Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
+ Value *FallbackValue = EmitScalarExpr(E->getArg(1));
+ Value *ResultOrFallback =
+ Builder.CreateSelect(IsZero, FallbackValue, Result, "ctzg");
+ return RValue::get(ResultOrFallback);
}
case Builtin::BI__builtin_clzs:
case Builtin::BI__builtin_clz:
case Builtin::BI__builtin_clzl:
- case Builtin::BI__builtin_clzll: {
- Value *ArgValue = EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
+ case Builtin::BI__builtin_clzll:
+ case Builtin::BI__builtin_clzg: {
+ bool HasFallback = BuiltinIDIfNoAsmLabel == Builtin::BI__builtin_clzg &&
+ E->getNumArgs() > 1;
+
+ Value *ArgValue =
+ HasFallback ? EmitScalarExpr(E->getArg(0))
+ : EmitCheckedArgForBuiltin(E->getArg(0), BCK_CLZPassedZero);
llvm::Type *ArgType = ArgValue->getType();
Function *F = CGM.getIntrinsic(Intrinsic::ctlz, ArgType);
llvm::Type *ResultType = ConvertType(E->getType());
- Value *ZeroUndef = Builder.getInt1(getTarget().isCLZForZeroUndef());
+ Value *ZeroUndef =
+ Builder.getInt1(HasFallback || getTarget().isCLZForZeroUndef());
Value *Result = Builder.CreateCall(F, {ArgValue, ZeroUndef});
if (Result->getType() != ResultType)
Result = Builder.CreateIntCast(Result, ResultType, /*isSigned*/true,
"cast");
- return RValue::get(Result);
+ if (!HasFallback)
+ return RValue::get(Result);
+
+ Value *Zero = Constant::getNullValue(ArgType);
+ Value *IsZero = Builder.CreateICmpEQ(ArgValue, Zero, "iszero");
+ Value *FallbackValue = EmitScalarExpr(E->getArg(1));
+ Value *ResultOrFallback =
+ Builder.CreateSelect(IsZero, FallbackValue, Result, "clzg");
+ return RValue::get(ResultOrFallback);
}
case Builtin::BI__builtin_ffs:
case Builtin::BI__builtin_ffsl:
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index dc42faf8dbb9..2ef5ed04af30 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -1242,27 +1242,38 @@ static void emitStoresForConstant(CodeGenModule &CGM, const VarDecl &D,
return;
}
- // If the initializer is small, use a handful of stores.
+ // If the initializer is small or trivialAutoVarInit is set, use a handful of
+ // stores.
+ bool IsTrivialAutoVarInitPattern =
+ CGM.getContext().getLangOpts().getTrivialAutoVarInit() ==
+ LangOptions::TrivialAutoVarInitKind::Pattern;
if (shouldSplitConstantStore(CGM, ConstantSize)) {
if (auto *STy = dyn_cast<llvm::StructType>(Ty)) {
- const llvm::StructLayout *Layout =
- CGM.getDataLayout().getStructLayout(STy);
- for (unsigned i = 0; i != constant->getNumOperands(); i++) {
- CharUnits CurOff = CharUnits::fromQuantity(Layout->getElementOffset(i));
- Address EltPtr = Builder.CreateConstInBoundsByteGEP(
- Loc.withElementType(CGM.Int8Ty), CurOff);
- emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder,
- constant->getAggregateElement(i), IsAutoInit);
+ if (STy == Loc.getElementType() ||
+ (STy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) {
+ const llvm::StructLayout *Layout =
+ CGM.getDataLayout().getStructLayout(STy);
+ for (unsigned i = 0; i != constant->getNumOperands(); i++) {
+ CharUnits CurOff =
+ CharUnits::fromQuantity(Layout->getElementOffset(i));
+ Address EltPtr = Builder.CreateConstInBoundsByteGEP(
+ Loc.withElementType(CGM.Int8Ty), CurOff);
+ emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder,
+ constant->getAggregateElement(i), IsAutoInit);
+ }
+ return;
}
- return;
} else if (auto *ATy = dyn_cast<llvm::ArrayType>(Ty)) {
- for (unsigned i = 0; i != ATy->getNumElements(); i++) {
- Address EltPtr = Builder.CreateConstGEP(
- Loc.withElementType(ATy->getElementType()), i);
- emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder,
- constant->getAggregateElement(i), IsAutoInit);
+ if (ATy == Loc.getElementType() ||
+ (ATy != Loc.getElementType() && IsTrivialAutoVarInitPattern)) {
+ for (unsigned i = 0; i != ATy->getNumElements(); i++) {
+ Address EltPtr = Builder.CreateConstGEP(
+ Loc.withElementType(ATy->getElementType()), i);
+ emitStoresForConstant(CGM, D, EltPtr, isVolatile, Builder,
+ constant->getAggregateElement(i), IsAutoInit);
+ }
+ return;
}
- return;
}
}
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 8898e3f22a7d..cb5a004e4f4a 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -1341,10 +1341,8 @@ struct SaveRetExprRAII {
};
} // namespace
-/// If we have 'return f(...);', where both caller and callee are SwiftAsync,
-/// codegen it as 'tail call ...; ret void;'.
-static void makeTailCallIfSwiftAsync(const CallExpr *CE, CGBuilderTy &Builder,
- const CGFunctionInfo *CurFnInfo) {
+/// Determine if the given call uses the swiftasync calling convention.
+static bool isSwiftAsyncCallee(const CallExpr *CE) {
auto calleeQualType = CE->getCallee()->getType();
const FunctionType *calleeType = nullptr;
if (calleeQualType->isFunctionPointerType() ||
@@ -1359,18 +1357,12 @@ static void makeTailCallIfSwiftAsync(const CallExpr *CE, CGBuilderTy &Builder,
// getMethodDecl() doesn't handle member pointers at the moment.
calleeType = methodDecl->getType()->castAs<FunctionType>();
} else {
- return;
+ return false;
}
} else {
- return;
- }
- if (calleeType->getCallConv() == CallingConv::CC_SwiftAsync &&
- (CurFnInfo->getASTCallingConvention() == CallingConv::CC_SwiftAsync)) {
- auto CI = cast<llvm::CallInst>(&Builder.GetInsertBlock()->back());
- CI->setTailCallKind(llvm::CallInst::TCK_MustTail);
- Builder.CreateRetVoid();
- Builder.ClearInsertionPoint();
+ return false;
}
+ return calleeType->getCallConv() == CallingConv::CC_SwiftAsync;
}
/// EmitReturnStmt - Note that due to GCC extensions, this can have an operand
@@ -1410,6 +1402,19 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) {
RunCleanupsScope cleanupScope(*this);
if (const auto *EWC = dyn_cast_or_null<ExprWithCleanups>(RV))
RV = EWC->getSubExpr();
+
+ // If we're in a swiftasynccall function, and the return expression is a
+ // call to a swiftasynccall function, mark the call as the musttail call.
+ std::optional<llvm::SaveAndRestore<const CallExpr *>> SaveMustTail;
+ if (RV && CurFnInfo &&
+ CurFnInfo->getASTCallingConvention() == CallingConv::CC_SwiftAsync) {
+ if (auto CE = dyn_cast<CallExpr>(RV)) {
+ if (isSwiftAsyncCallee(CE)) {
+ SaveMustTail.emplace(MustTailCall, CE);
+ }
+ }
+ }
+
// FIXME: Clean this up by using an LValue for ReturnTemp,
// EmitStoreThroughLValue, and EmitAnyExpr.
// Check if the NRVO candidate was not globalized in OpenMP mode.
@@ -1432,8 +1437,6 @@ void CodeGenFunction::EmitReturnStmt(const ReturnStmt &S) {
// for side effects.
if (RV) {
EmitAnyExpr(RV);
- if (auto *CE = dyn_cast<CallExpr>(RV))
- makeTailCallIfSwiftAsync(CE, Builder, CurFnInfo);
}
} else if (!RV) {
// Do nothing (return value is left uninitialized)
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 2291c991fb11..1ec0f159ebcb 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3019,6 +3019,10 @@ Address X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
ABIArgInfo AI = classifyArgumentType(Ty, 0, neededInt, neededSSE,
/*isNamedArg*/false);
+ // Empty records are ignored for parameter passing purposes.
+ if (AI.isIgnore())
+ return CGF.CreateMemTemp(Ty);
+
// AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
// in the registers. If not go to step 7.
if (!neededInt && !neededSSE)
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 767c1cd47e8c..7a53764364ce 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6203,28 +6203,35 @@ std::string Driver::GetStdModuleManifestPath(const Compilation &C,
switch (TC.GetCXXStdlibType(C.getArgs())) {
case ToolChain::CST_Libcxx: {
- std::string lib = GetFilePath("libc++.so", TC);
-
- // Note when there are multiple flavours of libc++ the module json needs to
- // look at the command-line arguments for the proper json.
- // These flavours do not exist at the moment, but there are plans to
- // provide a variant that is built with sanitizer instrumentation enabled.
-
- // For example
- // StringRef modules = [&] {
- // const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs());
- // if (Sanitize.needsAsanRt())
- // return "modules-asan.json";
- // return "modules.json";
- // }();
-
- SmallString<128> path(lib.begin(), lib.end());
- llvm::sys::path::remove_filename(path);
- llvm::sys::path::append(path, "modules.json");
- if (TC.getVFS().exists(path))
- return static_cast<std::string>(path);
+ auto evaluate = [&](const char *library) -> std::optional<std::string> {
+ std::string lib = GetFilePath(library, TC);
+
+ // Note when there are multiple flavours of libc++ the module json needs
+ // to look at the command-line arguments for the proper json. These
+ // flavours do not exist at the moment, but there are plans to provide a
+ // variant that is built with sanitizer instrumentation enabled.
+
+ // For example
+ // StringRef modules = [&] {
+ // const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs());
+ // if (Sanitize.needsAsanRt())
+ // return "libc++.modules-asan.json";
+ // return "libc++.modules.json";
+ // }();
+
+ SmallString<128> path(lib.begin(), lib.end());
+ llvm::sys::path::remove_filename(path);
+ llvm::sys::path::append(path, "libc++.modules.json");
+ if (TC.getVFS().exists(path))
+ return static_cast<std::string>(path);
+
+ return {};
+ };
- return error;
+ if (std::optional<std::string> result = evaluate("libc++.so"); result)
+ return *result;
+
+ return evaluate("libc++.a").value_or(error);
}
case ToolChain::CST_Libstdcxx:
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index bc9cc8ce6cf5..86a287db72a4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5863,8 +5863,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
} else if (Triple.getArch() == llvm::Triple::x86_64) {
Ok = llvm::is_contained({"small", "kernel", "medium", "large", "tiny"},
CM);
- } else if (Triple.isNVPTX() || Triple.isAMDGPU()) {
- // NVPTX/AMDGPU does not care about the code model and will accept
+ } else if (Triple.isNVPTX() || Triple.isAMDGPU() || Triple.isSPIRV()) {
+ // NVPTX/AMDGPU/SPIRV does not care about the code model and will accept
// whatever works for the host.
Ok = true;
} else if (Triple.isSPARC64()) {
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index a8882e82e171..be7a0b247e03 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -207,6 +207,8 @@ _mm256_div_ps(__m256 __a, __m256 __b)
/// Compares two 256-bit vectors of [4 x double] and returns the greater
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
@@ -226,6 +228,8 @@ _mm256_max_pd(__m256d __a, __m256d __b)
/// Compares two 256-bit vectors of [8 x float] and returns the greater
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
@@ -245,6 +249,8 @@ _mm256_max_ps(__m256 __a, __m256 __b)
/// Compares two 256-bit vectors of [4 x double] and returns the lesser
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
@@ -264,6 +270,8 @@ _mm256_min_pd(__m256d __a, __m256d __b)
/// Compares two 256-bit vectors of [8 x float] and returns the lesser
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
@@ -1604,9 +1612,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand.
///
-/// Returns a [2 x double] vector consisting of two doubles corresponding to
-/// the two comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -1663,9 +1671,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// [4 x float], using the operation specified by the immediate integer
/// operand.
///
-/// Returns a [4 x float] vector consisting of four floats corresponding to
-/// the four comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -1721,9 +1729,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// 256-bit vectors of [4 x double], using the operation specified by the
/// immediate integer operand.
///
-/// Returns a [4 x double] vector consisting of four doubles corresponding to
-/// the four comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -1781,9 +1789,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// [8 x float], using the operation specified by the immediate integer
/// operand.
///
-/// Returns a [8 x float] vector consisting of eight floats corresponding to
-/// the eight comparison results: zero if the comparison is false, and all
-/// 1's if the comparison is true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -1842,8 +1850,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// two 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand.
///
-/// If the result is true, all 64 bits of the destination vector are set;
-/// otherwise they are cleared.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -1900,8 +1909,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
/// vectors of [4 x float], using the operation specified by the immediate
/// integer operand.
///
-/// If the result is true, all 32 bits of the destination vector are set;
-/// otherwise they are cleared.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index f0c2db752195..e85bfc47aa5c 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -259,6 +259,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
@@ -278,9 +280,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
}
/// Performs element-by-element comparison of the two 128-bit vectors of
-/// [2 x double] and returns the vector containing the lesser of each pair of
+/// [2 x double] and returns a vector containing the lesser of each pair of
/// values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
@@ -301,6 +305,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
@@ -320,9 +326,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
}
/// Performs element-by-element comparison of the two 128-bit vectors of
-/// [2 x double] and returns the vector containing the greater of each pair
+/// [2 x double] and returns a vector containing the greater of each pair
/// of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
@@ -412,7 +420,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] for equality.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -432,7 +441,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -452,7 +462,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than or equal to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -472,7 +483,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -492,7 +504,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than or equal to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -512,8 +525,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are ordered with respect to those in the second operand.
///
-/// A pair of double-precision values are "ordered" with respect to each
-/// other if neither value is a NaN. Each comparison yields 0x0 for false,
+/// A pair of double-precision values are ordered with respect to each
+/// other if neither value is a NaN. Each comparison returns 0x0 for false,
/// 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
@@ -534,8 +547,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unordered with respect to those in the second operand.
///
-/// A pair of double-precision values are "unordered" with respect to each
-/// other if one or both values are NaN. Each comparison yields 0x0 for
+/// A pair of double-precision values are unordered with respect to each
+/// other if one or both values are NaN. Each comparison returns 0x0 for
/// false, 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
@@ -557,7 +570,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unequal to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -577,7 +591,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -597,7 +612,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than or equal to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -617,7 +633,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -637,7 +654,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than or equal to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -656,7 +674,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -680,7 +699,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -704,7 +724,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -728,7 +749,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -753,7 +775,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -775,11 +798,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is "ordered" with respect to the
+/// the value in the first parameter is ordered with respect to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-/// of double-precision values are "ordered" with respect to each other if
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+/// of double-precision values are ordered with respect to each other if
/// neither value is a NaN.
///
/// \headerfile <x86intrin.h>
@@ -801,11 +824,11 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is "unordered" with respect to the
+/// the value in the first parameter is unordered with respect to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-/// of double-precision values are "unordered" with respect to each other if
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+/// of double-precision values are unordered with respect to each other if
/// one or both values are NaN.
///
/// \headerfile <x86intrin.h>
@@ -831,7 +854,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -855,7 +879,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
/// the value in the first parameter is not less than the corresponding
/// value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -879,7 +904,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
/// the value in the first parameter is not less than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -903,7 +929,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
/// the value in the first parameter is not greater than the corresponding
/// value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -928,7 +955,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
/// the value in the first parameter is not greater than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -951,8 +979,8 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -975,8 +1003,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -999,8 +1027,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1023,8 +1051,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1047,8 +1075,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1071,8 +1099,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 1 is returned.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
@@ -1093,8 +1121,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1117,8 +1145,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1141,8 +1169,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1165,8 +1193,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1189,8 +1217,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1213,8 +1241,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 1 is returned.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
@@ -3033,7 +3061,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
/// Compares each of the corresponding 8-bit values of the 128-bit
/// integer vectors for equality.
///
-/// Each comparison yields 0x0 for false, 0xFF for true.
+/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3052,7 +3080,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
/// Compares each of the corresponding 16-bit values of the 128-bit
/// integer vectors for equality.
///
-/// Each comparison yields 0x0 for false, 0xFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3071,7 +3099,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
/// Compares each of the corresponding 32-bit values of the 128-bit
/// integer vectors for equality.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3091,7 +3119,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
/// integer vectors to determine if the values in the first operand are
/// greater than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFF for true.
+/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3113,7 +3141,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3133,7 +3161,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
/// 128-bit integer vectors to determine if the values in the first operand
/// are greater than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3153,7 +3181,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
/// integer vectors to determine if the values in the first operand are less
/// than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFF for true.
+/// Each comparison returns 0x0 for false, 0xFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3173,7 +3201,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -3193,7 +3221,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
/// 128-bit integer vectors to determine if the values in the first operand
/// are less than those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -4777,7 +4805,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
/// 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -4811,7 +4841,9 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
/// two 128-bit vectors of [2 x double], using the operation specified by the
/// immediate integer operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index b2c68c3b7be9..040194786a27 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -316,6 +316,8 @@ _mm_rsqrt_ps(__m128 __a)
/// operands and returns the lesser value in the low-order bits of the
/// vector of [4 x float].
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
@@ -338,6 +340,8 @@ _mm_min_ss(__m128 __a, __m128 __b)
/// Compares two 128-bit vectors of [4 x float] and returns the lesser
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
@@ -358,6 +362,8 @@ _mm_min_ps(__m128 __a, __m128 __b)
/// operands and returns the greater value in the low-order bits of a 128-bit
/// vector of [4 x float].
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
@@ -380,6 +386,8 @@ _mm_max_ss(__m128 __a, __m128 __b)
/// Compares two 128-bit vectors of [4 x float] and returns the greater
/// of each pair of values.
///
+/// If either value in a comparison is NaN, returns the value from \a __b.
+///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
@@ -478,6 +486,7 @@ _mm_xor_ps(__m128 __a, __m128 __b)
///
/// The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector [4 x float].
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -501,6 +510,7 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b)
/// 128-bit vectors of [4 x float] for equality.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -523,6 +533,7 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b)
///
/// The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -547,6 +558,7 @@ _mm_cmplt_ss(__m128 __a, __m128 __b)
/// operand are less than those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -569,6 +581,7 @@ _mm_cmplt_ps(__m128 __a, __m128 __b)
///
/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
/// the low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -593,6 +606,7 @@ _mm_cmple_ss(__m128 __a, __m128 __b)
/// operand are less than or equal to those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -615,6 +629,7 @@ _mm_cmple_ps(__m128 __a, __m128 __b)
///
/// The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -641,6 +656,7 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b)
/// operand are greater than those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -663,6 +679,7 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b)
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -689,6 +706,7 @@ _mm_cmpge_ss(__m128 __a, __m128 __b)
/// operand are greater than or equal to those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
@@ -710,6 +728,7 @@ _mm_cmpge_ps(__m128 __a, __m128 __b)
///
/// The comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -734,6 +753,7 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b)
/// 128-bit vectors of [4 x float] for inequality.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -757,6 +777,7 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b)
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -782,6 +803,7 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
/// operand are not less than those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -805,6 +827,7 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b)
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -830,6 +853,7 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b)
/// operand are not less than or equal to those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -853,6 +877,7 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b)
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -880,6 +905,7 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b)
/// operand are not greater than those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -903,6 +929,7 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b)
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
/// low-order bits of a vector of [4 x float].
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -930,6 +957,7 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b)
/// operand are not greater than or equal to those in the second operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
@@ -951,8 +979,9 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b)
/// operands to determine if the value in the first operand is ordered with
/// respect to the corresponding value in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
-/// low-order bits of a vector of [4 x float].
+/// A pair of floating-point values are ordered with respect to each
+/// other if neither value is a NaN. Each comparison returns 0x0 for false,
+/// 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -977,7 +1006,9 @@ _mm_cmpord_ss(__m128 __a, __m128 __b)
/// 128-bit vectors of [4 x float] to determine if the values in the first
/// operand are ordered with respect to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// A pair of floating-point values are ordered with respect to each
+/// other if neither value is a NaN. Each comparison returns 0x0 for false,
+/// 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -999,8 +1030,9 @@ _mm_cmpord_ps(__m128 __a, __m128 __b)
/// operands to determine if the value in the first operand is unordered
/// with respect to the corresponding value in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true, in the
-/// low-order bits of a vector of [4 x float].
+/// A pair of floating-point values are unordered with respect to each
+/// other if one or both values are NaN. Each comparison returns 0x0 for
+/// false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -1025,7 +1057,9 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b)
/// 128-bit vectors of [4 x float] to determine if the values in the first
/// operand are unordered with respect to those in the second operand.
///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// A pair of floating-point values are unordered with respect to each
+/// other if one or both values are NaN. Each comparison returns 0x0 for
+/// false, 0xFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
@@ -1046,8 +1080,8 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b)
/// Compares two 32-bit float values in the low-order bits of both
/// operands for equality.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1071,8 +1105,8 @@ _mm_comieq_ss(__m128 __a, __m128 __b)
/// operands to determine if the first operand is less than the second
/// operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1096,8 +1130,8 @@ _mm_comilt_ss(__m128 __a, __m128 __b)
/// operands to determine if the first operand is less than or equal to the
/// second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1120,8 +1154,8 @@ _mm_comile_ss(__m128 __a, __m128 __b)
/// operands to determine if the first operand is greater than the second
/// operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1144,8 +1178,8 @@ _mm_comigt_ss(__m128 __a, __m128 __b)
/// operands to determine if the first operand is greater than or equal to
/// the second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1168,8 +1202,8 @@ _mm_comige_ss(__m128 __a, __m128 __b)
/// operands to determine if the first operand is not equal to the second
/// operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
@@ -1191,8 +1225,8 @@ _mm_comineq_ss(__m128 __a, __m128 __b)
/// Performs an unordered comparison of two 32-bit float values using
/// the low-order bits of both operands to determine equality.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1215,8 +1249,8 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b)
/// the low-order bits of both operands to determine if the first operand is
/// less than the second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1239,8 +1273,8 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b)
/// the low-order bits of both operands to determine if the first operand is
/// less than or equal to the second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1263,8 +1297,8 @@ _mm_ucomile_ss(__m128 __a, __m128 __b)
/// the low-order bits of both operands to determine if the first operand is
/// greater than the second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1287,8 +1321,8 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b)
/// the low-order bits of both operands to determine if the first operand is
/// greater than or equal to the second operand.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
@@ -1310,8 +1344,8 @@ _mm_ucomige_ss(__m128 __a, __m128 __b)
/// Performs an unordered comparison of two 32-bit float values using
/// the low-order bits of both operands to determine inequality.
///
-/// The comparison returns 0 for false, 1 for true. If either of the two
-/// lower floating-point values is NaN, returns 0.
+/// The comparison returns 0 for false, 1 for true. If either value in a
+/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
@@ -3028,6 +3062,8 @@ _mm_movemask_ps(__m128 __a)
/// operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
@@ -3061,6 +3097,8 @@ _mm_movemask_ps(__m128 __a)
/// integer operand.
///
/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+/// If either value in a comparison is NaN, comparisons that are ordered
+/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
diff --git a/clang/lib/InstallAPI/DylibVerifier.cpp b/clang/lib/InstallAPI/DylibVerifier.cpp
index 24e0d0addf2f..94b8e9cd3233 100644
--- a/clang/lib/InstallAPI/DylibVerifier.cpp
+++ b/clang/lib/InstallAPI/DylibVerifier.cpp
@@ -66,17 +66,15 @@ std::string DylibVerifier::getAnnotatedName(const Record *R,
Annotation += "(tlv) ";
// Check if symbol represents only part of a @interface declaration.
- const bool IsAnnotatedObjCClass =
- ((SymCtx.ObjCIFKind != ObjCIFSymbolKind::None) &&
- (SymCtx.ObjCIFKind <= ObjCIFSymbolKind::EHType));
-
- if (IsAnnotatedObjCClass) {
- if (SymCtx.ObjCIFKind == ObjCIFSymbolKind::EHType)
- Annotation += "Exception Type of ";
- if (SymCtx.ObjCIFKind == ObjCIFSymbolKind::MetaClass)
- Annotation += "Metaclass of ";
- if (SymCtx.ObjCIFKind == ObjCIFSymbolKind::Class)
- Annotation += "Class of ";
+ switch (SymCtx.ObjCIFKind) {
+ default:
+ break;
+ case ObjCIFSymbolKind::EHType:
+ return Annotation + "Exception Type of " + PrettyName;
+ case ObjCIFSymbolKind::MetaClass:
+ return Annotation + "Metaclass of " + PrettyName;
+ case ObjCIFSymbolKind::Class:
+ return Annotation + "Class of " + PrettyName;
}
// Only print symbol type prefix or leading "_" if there is no source location
@@ -90,9 +88,6 @@ std::string DylibVerifier::getAnnotatedName(const Record *R,
return Annotation + PrettyName;
}
- if (IsAnnotatedObjCClass)
- return Annotation + PrettyName;
-
switch (SymCtx.Kind) {
case EncodeKind::GlobalSymbol:
return Annotation + PrettyName;
@@ -332,9 +327,9 @@ bool DylibVerifier::compareSymbolFlags(const Record *R, SymbolContext &SymCtx,
}
if (!DR->isThreadLocalValue() && R->isThreadLocalValue()) {
Ctx.emitDiag([&]() {
- SymCtx.FA->D->getLocation(),
- Ctx.Diag->Report(diag::err_header_symbol_flags_mismatch)
- << getAnnotatedName(DR, SymCtx) << R->isThreadLocalValue();
+ Ctx.Diag->Report(SymCtx.FA->D->getLocation(),
+ diag::err_header_symbol_flags_mismatch)
+ << getAnnotatedName(R, SymCtx) << R->isThreadLocalValue();
});
return false;
}
@@ -520,5 +515,147 @@ void DylibVerifier::VerifierContext::emitDiag(
Report();
}
+// The existence of weak-defined RTTI can not always be inferred from the
+// header files because they can be generated as part of an implementation
+// file.
+// InstallAPI doesn't warn about weak-defined RTTI, because this doesn't affect
+// static linking and so can be ignored for text-api files.
+static bool shouldIgnoreCpp(StringRef Name, bool IsWeakDef) {
+ return (IsWeakDef &&
+ (Name.starts_with("__ZTI") || Name.starts_with("__ZTS")));
+}
+void DylibVerifier::visitSymbolInDylib(const Record &R, SymbolContext &SymCtx) {
+ // Undefined symbols should not be in InstallAPI generated text-api files.
+ if (R.isUndefined()) {
+ updateState(Result::Valid);
+ return;
+ }
+
+ // Internal symbols should not be in InstallAPI generated text-api files.
+ if (R.isInternal()) {
+ updateState(Result::Valid);
+ return;
+ }
+
+ // Allow zippered symbols with potentially mismatching availability
+ // between macOS and macCatalyst in the final text-api file.
+ const StringRef SymbolName(SymCtx.SymbolName);
+ if (const Symbol *Sym = Exports->findSymbol(SymCtx.Kind, SymCtx.SymbolName,
+ SymCtx.ObjCIFKind)) {
+ if (Sym->hasArchitecture(Ctx.Target.Arch)) {
+ updateState(Result::Ignore);
+ return;
+ }
+ }
+
+ if (shouldIgnoreCpp(SymbolName, R.isWeakDefined())) {
+ updateState(Result::Valid);
+ return;
+ }
+
+ // All checks at this point classify as some kind of violation that should be
+ // reported.
+
+ // Regardless of verification mode, error out on mismatched special linker
+ // symbols.
+ if (SymbolName.starts_with("$ld$")) {
+ Ctx.emitDiag([&]() {
+ Ctx.Diag->Report(diag::err_header_symbol_missing)
+ << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false);
+ });
+ updateState(Result::Invalid);
+ return;
+ }
+
+ // Missing declarations for exported symbols are hard errors on Pedantic mode.
+ if (Mode == VerificationMode::Pedantic) {
+ Ctx.emitDiag([&]() {
+ Ctx.Diag->Report(diag::err_header_symbol_missing)
+ << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false);
+ });
+ updateState(Result::Invalid);
+ return;
+ }
+
+ // Missing declarations for exported symbols are warnings on ErrorsAndWarnings
+ // mode.
+ if (Mode == VerificationMode::ErrorsAndWarnings) {
+ Ctx.emitDiag([&]() {
+ Ctx.Diag->Report(diag::warn_header_symbol_missing)
+ << getAnnotatedName(&R, SymCtx, /*ValidSourceLoc=*/false);
+ });
+ updateState(Result::Ignore);
+ return;
+ }
+
+ // Missing declarations are dropped for ErrorsOnly mode. It is the last
+ // remaining mode.
+ updateState(Result::Ignore);
+ return;
+}
+
+void DylibVerifier::visitGlobal(const GlobalRecord &R) {
+ if (R.isVerified())
+ return;
+ SymbolContext SymCtx;
+ SimpleSymbol Sym = parseSymbol(R.getName());
+ SymCtx.SymbolName = Sym.Name;
+ SymCtx.Kind = Sym.Kind;
+ visitSymbolInDylib(R, SymCtx);
+}
+
+void DylibVerifier::visitObjCIVar(const ObjCIVarRecord &R,
+ const StringRef Super) {
+ if (R.isVerified())
+ return;
+ SymbolContext SymCtx;
+ SymCtx.SymbolName = ObjCIVarRecord::createScopedName(Super, R.getName());
+ SymCtx.Kind = EncodeKind::ObjectiveCInstanceVariable;
+ visitSymbolInDylib(R, SymCtx);
+}
+
+void DylibVerifier::visitObjCInterface(const ObjCInterfaceRecord &R) {
+ if (R.isVerified())
+ return;
+ SymbolContext SymCtx;
+ SymCtx.SymbolName = R.getName();
+ SymCtx.ObjCIFKind = assignObjCIFSymbolKind(&R);
+ if (SymCtx.ObjCIFKind > ObjCIFSymbolKind::EHType) {
+ if (R.hasExceptionAttribute()) {
+ SymCtx.Kind = EncodeKind::ObjectiveCClassEHType;
+ visitSymbolInDylib(R, SymCtx);
+ }
+ SymCtx.Kind = EncodeKind::ObjectiveCClass;
+ visitSymbolInDylib(R, SymCtx);
+ } else {
+ SymCtx.Kind = R.hasExceptionAttribute() ? EncodeKind::ObjectiveCClassEHType
+ : EncodeKind::ObjectiveCClass;
+ visitSymbolInDylib(R, SymCtx);
+ }
+
+ for (const ObjCIVarRecord *IV : R.getObjCIVars())
+ visitObjCIVar(*IV, R.getName());
+}
+
+void DylibVerifier::visitObjCCategory(const ObjCCategoryRecord &R) {
+ for (const ObjCIVarRecord *IV : R.getObjCIVars())
+ visitObjCIVar(*IV, R.getSuperClassName());
+}
+
+DylibVerifier::Result DylibVerifier::verifyRemainingSymbols() {
+ if (getState() == Result::NoVerify)
+ return Result::NoVerify;
+ assert(!Dylib.empty() && "No binary to verify against");
+
+ Ctx.DiscoveredFirstError = false;
+ Ctx.PrintArch = true;
+ for (std::shared_ptr<RecordsSlice> Slice : Dylib) {
+ Ctx.Target = Slice->getTarget();
+ Ctx.DylibSlice = Slice.get();
+ Slice->visit(*this);
+ }
+ return getState();
+}
+
} // namespace installapi
} // namespace clang
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index 88c3a1469e8e..ae23cb432c43 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -1823,7 +1823,7 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind,
}
goto ExpectedExpression;
case tok::l_square:
- if (getLangOpts().CPlusPlus11) {
+ if (getLangOpts().CPlusPlus) {
if (getLangOpts().ObjC) {
// C++11 lambda expressions and Objective-C message sends both start with a
// square bracket. There are three possibilities here:
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 9471f6f725ef..73c85c585baa 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -806,9 +806,8 @@ ExprResult Parser::ParseLambdaExpression() {
///
/// If we are not looking at a lambda expression, returns ExprError().
ExprResult Parser::TryParseLambdaExpression() {
- assert(getLangOpts().CPlusPlus11
- && Tok.is(tok::l_square)
- && "Not at the start of a possible lambda expression.");
+ assert(getLangOpts().CPlusPlus && Tok.is(tok::l_square) &&
+ "Not at the start of a possible lambda expression.");
const Token Next = NextToken();
if (Next.is(tok::eof)) // Nothing else to lookup here...
@@ -1326,7 +1325,9 @@ static void DiagnoseStaticSpecifierRestrictions(Parser &P,
ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
LambdaIntroducer &Intro) {
SourceLocation LambdaBeginLoc = Intro.Range.getBegin();
- Diag(LambdaBeginLoc, diag::warn_cxx98_compat_lambda);
+ Diag(LambdaBeginLoc, getLangOpts().CPlusPlus11
+ ? diag::warn_cxx98_compat_lambda
+ : diag::ext_lambda);
PrettyStackTraceLoc CrashInfo(PP.getSourceManager(), LambdaBeginLoc,
"lambda expression parsing");
diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp
index 637f21176792..423497bfcb66 100644
--- a/clang/lib/Parse/ParseInit.cpp
+++ b/clang/lib/Parse/ParseInit.cpp
@@ -35,7 +35,7 @@ bool Parser::MayBeDesignationStart() {
return true;
case tok::l_square: { // designator: array-designator
- if (!PP.getLangOpts().CPlusPlus11)
+ if (!PP.getLangOpts().CPlusPlus)
return true;
// C++11 lambda expressions and C99 designators can be ambiguous all the
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index ef3ab16ba29b..246e3577809a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2399,6 +2399,48 @@ static bool SemaBuiltinPopcountg(Sema &S, CallExpr *TheCall) {
return false;
}
+/// Checks that __builtin_{clzg,ctzg} was called with a first argument, which is
+/// an unsigned integer, and an optional second argument, which is promoted to
+/// an 'int'.
+static bool SemaBuiltinCountZeroBitsGeneric(Sema &S, CallExpr *TheCall) {
+ if (checkArgCountRange(S, TheCall, 1, 2))
+ return true;
+
+ ExprResult Arg0Res = S.DefaultLvalueConversion(TheCall->getArg(0));
+ if (Arg0Res.isInvalid())
+ return true;
+
+ Expr *Arg0 = Arg0Res.get();
+ TheCall->setArg(0, Arg0);
+
+ QualType Arg0Ty = Arg0->getType();
+
+ if (!Arg0Ty->isUnsignedIntegerType()) {
+ S.Diag(Arg0->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+ << 1 << /*unsigned integer ty*/ 7 << Arg0Ty;
+ return true;
+ }
+
+ if (TheCall->getNumArgs() > 1) {
+ ExprResult Arg1Res = S.UsualUnaryConversions(TheCall->getArg(1));
+ if (Arg1Res.isInvalid())
+ return true;
+
+ Expr *Arg1 = Arg1Res.get();
+ TheCall->setArg(1, Arg1);
+
+ QualType Arg1Ty = Arg1->getType();
+
+ if (!Arg1Ty->isSpecificBuiltinType(BuiltinType::Int)) {
+ S.Diag(Arg1->getBeginLoc(), diag::err_builtin_invalid_arg_type)
+ << 2 << /*'int' ty*/ 8 << Arg1Ty;
+ return true;
+ }
+ }
+
+ return false;
+}
+
ExprResult
Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
CallExpr *TheCall) {
@@ -3187,6 +3229,11 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
if (SemaBuiltinPopcountg(*this, TheCall))
return ExprError();
break;
+ case Builtin::BI__builtin_clzg:
+ case Builtin::BI__builtin_ctzg:
+ if (SemaBuiltinCountZeroBitsGeneric(*this, TheCall))
+ return ExprError();
+ break;
}
if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall))
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 8ceb79555fb5..aa754d47a0c4 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -16109,7 +16109,7 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body,
FD->setInvalidDecl();
}
}
- } else if (getLangOpts().CPlusPlus11 && isLambdaCallOperator(FD)) {
+ } else if (getLangOpts().CPlusPlus && isLambdaCallOperator(FD)) {
// In C++11, we don't use 'auto' deduction rules for lambda call
// operators because we don't support return type deduction.
auto *LSI = getCurLambda();
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index e258a4f7c894..ee732679417e 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -9738,7 +9738,8 @@ bool Sema::ShouldDeleteSpecialMember(CXXMethodDecl *MD, CXXSpecialMember CSM,
return false;
CXXRecordDecl *RD = MD->getParent();
assert(!RD->isDependentType() && "do deletion after instantiation");
- if (!LangOpts.CPlusPlus11 || RD->isInvalidDecl())
+ if (!LangOpts.CPlusPlus || (!LangOpts.CPlusPlus11 && !RD->isLambda()) ||
+ RD->isInvalidDecl())
return false;
// C++11 [expr.lambda.prim]p19:
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 7b14323b0674..d7521a5363a3 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -1018,6 +1018,11 @@ static QualType applyObjCTypeArgs(Sema &S, SourceLocation loc, QualType type,
return type;
}
+ // Types that have __attribute__((NSObject)) are permitted.
+ if (typeArg->isObjCNSObjectType()) {
+ continue;
+ }
+
// Dependent types will be checked at instantiation time.
if (typeArg->isDependentType()) {
continue;
diff --git a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
index 16db6b249dc9..a8e573f7982b 100644
--- a/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
+++ b/clang/lib/StaticAnalyzer/Core/MemRegion.cpp
@@ -720,13 +720,21 @@ std::string MemRegion::getDescriptiveName(bool UseQuotes) const {
CI->getValue().toString(Idx);
ArrayIndices = (llvm::Twine("[") + Idx.str() + "]" + ArrayIndices).str();
}
- // If not a ConcreteInt, try to obtain the variable
- // name by calling 'getDescriptiveName' recursively.
+ // Index is symbolic, but may have a descriptive name.
else {
- std::string Idx = ER->getDescriptiveName(false);
- if (!Idx.empty()) {
- ArrayIndices = (llvm::Twine("[") + Idx + "]" + ArrayIndices).str();
- }
+ auto SI = ER->getIndex().getAs<nonloc::SymbolVal>();
+ if (!SI)
+ return "";
+
+ const MemRegion *OR = SI->getAsSymbol()->getOriginRegion();
+ if (!OR)
+ return "";
+
+ std::string Idx = OR->getDescriptiveName(false);
+ if (Idx.empty())
+ return "";
+
+ ArrayIndices = (llvm::Twine("[") + Idx + "]" + ArrayIndices).str();
}
R = ER->getSuperRegion();
}
diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
index f12f1a5ac970..f82cd944750a 100644
--- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -226,6 +226,20 @@ ProgramStateRef ProgramState::killBinding(Loc LV) const {
return makeWithStore(newStore);
}
+/// SymbolicRegions are expected to be wrapped by an ElementRegion as a
+/// canonical representation. As a canonical representation, SymbolicRegions
+/// should be wrapped by ElementRegions before getting a FieldRegion.
+/// See f8643a9b31c4029942f67d4534c9139b45173504 why.
+SVal ProgramState::wrapSymbolicRegion(SVal Val) const {
+ const auto *BaseReg = dyn_cast_or_null<SymbolicRegion>(Val.getAsRegion());
+ if (!BaseReg)
+ return Val;
+
+ StoreManager &SM = getStateManager().getStoreManager();
+ QualType ElemTy = BaseReg->getPointeeStaticType();
+ return loc::MemRegionVal{SM.GetElementZeroRegion(BaseReg, ElemTy)};
+}
+
ProgramStateRef
ProgramState::enterStackFrame(const CallEvent &Call,
const StackFrameContext *CalleeCtx) const {
@@ -451,6 +465,24 @@ void ProgramState::setStore(const StoreRef &newStore) {
store = newStoreStore;
}
+SVal ProgramState::getLValue(const FieldDecl *D, SVal Base) const {
+ Base = wrapSymbolicRegion(Base);
+ return getStateManager().StoreMgr->getLValueField(D, Base);
+}
+
+SVal ProgramState::getLValue(const IndirectFieldDecl *D, SVal Base) const {
+ StoreManager &SM = *getStateManager().StoreMgr;
+ Base = wrapSymbolicRegion(Base);
+
+ // FIXME: This should work with `SM.getLValueField(D->getAnonField(), Base)`,
+ // but that would break some tests. There is probably a bug somewhere that it
+ // would expose.
+ for (const auto *I : D->chain()) {
+ Base = SM.getLValueField(cast<FieldDecl>(I), Base);
+ }
+ return Base;
+}
+
//===----------------------------------------------------------------------===//
// State pretty-printing.
//===----------------------------------------------------------------------===//
diff --git a/clang/test/Analysis/inlining/false-positive-suppression.cpp b/clang/test/Analysis/inlining/false-positive-suppression.cpp
index 56659b4a1941..2f9ed7f78b3f 100644
--- a/clang/test/Analysis/inlining/false-positive-suppression.cpp
+++ b/clang/test/Analysis/inlining/false-positive-suppression.cpp
@@ -210,3 +210,20 @@ namespace Cleanups {
testArgumentHelper(NonTrivial().getNull());
}
}
+
+class Bear *getNullBear() { return nullptr; }
+class Bear {
+public:
+ void brum() const;
+};
+class Door {
+public:
+ Door() : ptr(getNullBear()) {
+ ptr->brum();
+#ifndef SUPPRESSED
+ // expected-warning@-2 {{Called C++ object pointer is null}}
+#endif
+ }
+private:
+ Bear* ptr;
+};
diff --git a/clang/test/CodeGen/aapcs-align.cpp b/clang/test/CodeGen/aapcs-align.cpp
index 2886a32974b0..4f393d9e6b7f 100644
--- a/clang/test/CodeGen/aapcs-align.cpp
+++ b/clang/test/CodeGen/aapcs-align.cpp
@@ -134,8 +134,8 @@ void g6() {
f6m(1, 2, 3, 4, 5, s);
}
// CHECK: define{{.*}} void @g6
-// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 0])
-// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 0])
+// CHECK: call void @f6(i32 noundef 1, [4 x i32] [i32 6, i32 7, i32 0, i32 undef])
+// CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 undef])
// CHECK: declare void @f6(i32 noundef, [4 x i32])
// CHECK: declare void @f6m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [4 x i32])
}
diff --git a/clang/test/CodeGen/aapcs64-align.cpp b/clang/test/CodeGen/aapcs64-align.cpp
index 759413cbc4b5..de231f2123b9 100644
--- a/clang/test/CodeGen/aapcs64-align.cpp
+++ b/clang/test/CodeGen/aapcs64-align.cpp
@@ -75,8 +75,8 @@ void g4() {
f4m(1, 2, 3, 4, 5, s);
}
// CHECK: define{{.*}} void @g4()
-// CHECK: call void @f4(i32 noundef 1, [2 x i64] %{{.*}})
-// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}})
+// CHECK: call void @f4(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0])
+// CHECK: void @f4m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0])
// CHECK: declare void @f4(i32 noundef, [2 x i64])
// CHECK: declare void @f4m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64])
@@ -95,8 +95,8 @@ void f5m(int, int, int, int, int, P16);
f5m(1, 2, 3, 4, 5, s);
}
// CHECK: define{{.*}} void @g5()
-// CHECK: call void @f5(i32 noundef 1, [2 x i64] %{{.*}})
-// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] %{{.*}})
+// CHECK: call void @f5(i32 noundef 1, [2 x i64] [i64 30064771078, i64 0])
+// CHECK: void @f5m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [2 x i64] [i64 30064771078, i64 0])
// CHECK: declare void @f5(i32 noundef, [2 x i64])
// CHECK: declare void @f5m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [2 x i64])
diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c
index e5685e39173b..1fb39f9a3466 100644
--- a/clang/test/CodeGen/attr-counted-by.c
+++ b/clang/test/CodeGen/attr-counted-by.c
@@ -1314,17 +1314,10 @@ int test14(int idx) {
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15(
// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]]
-// NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4
-// NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8
// NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]]
-// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]]
//
// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15(
// SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] {
@@ -1342,17 +1335,10 @@ int test14(int idx) {
// NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test15(
// NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] {
// NO-SANITIZE-WITHOUT-ATTR-NEXT: entry:
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[FOO:%.*]] = alloca [[STRUCT_ANON_8:%.*]], align 4
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8
// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]]
-// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds ([[STRUCT_ANON_8:%.*]], ptr @__const.test15.foo, i64 1, i32 0), i64 0, i64 [[IDXPROM]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
+// NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]]
//
int test15(int idx) {
struct {
diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c
index 4f9641d357b7..407e0857d223 100644
--- a/clang/test/CodeGen/builtins.c
+++ b/clang/test/CodeGen/builtins.c
@@ -983,4 +983,208 @@ void test_builtin_popcountg(unsigned char uc, unsigned short us,
// CHECK-NEXT: ret void
}
+// CHECK-LABEL: define{{.*}} void @test_builtin_clzg
+void test_builtin_clzg(unsigned char uc, unsigned short us, unsigned int ui,
+ unsigned long ul, unsigned long long ull,
+ unsigned __int128 ui128, unsigned _BitInt(128) ubi128,
+ signed char sc, short s, int i) {
+ volatile int lz;
+ lz = __builtin_clzg(uc);
+ // CHECK: %1 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %2 = call i8 @llvm.ctlz.i8(i8 %1, i1 true)
+ // CHECK-NEXT: %cast = sext i8 %2 to i32
+ // CHECK-NEXT: store volatile i32 %cast, ptr %lz, align 4
+ lz = __builtin_clzg(us);
+ // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %4 = call i16 @llvm.ctlz.i16(i16 %3, i1 true)
+ // CHECK-NEXT: %cast1 = sext i16 %4 to i32
+ // CHECK-NEXT: store volatile i32 %cast1, ptr %lz, align 4
+ lz = __builtin_clzg(ui);
+ // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
+ // CHECK-NEXT: %6 = call i32 @llvm.ctlz.i32(i32 %5, i1 true)
+ // CHECK-NEXT: store volatile i32 %6, ptr %lz, align 4
+ lz = __builtin_clzg(ul);
+ // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
+ // CHECK-NEXT: %8 = call i64 @llvm.ctlz.i64(i64 %7, i1 true)
+ // CHECK-NEXT: %cast2 = trunc i64 %8 to i32
+ // CHECK-NEXT: store volatile i32 %cast2, ptr %lz, align 4
+ lz = __builtin_clzg(ull);
+ // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
+ // CHECK-NEXT: %10 = call i64 @llvm.ctlz.i64(i64 %9, i1 true)
+ // CHECK-NEXT: %cast3 = trunc i64 %10 to i32
+ // CHECK-NEXT: store volatile i32 %cast3, ptr %lz, align 4
+ lz = __builtin_clzg(ui128);
+ // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
+ // CHECK-NEXT: %12 = call i128 @llvm.ctlz.i128(i128 %11, i1 true)
+ // CHECK-NEXT: %cast4 = trunc i128 %12 to i32
+ // CHECK-NEXT: store volatile i32 %cast4, ptr %lz, align 4
+ lz = __builtin_clzg(ubi128);
+ // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
+ // CHECK-NEXT: %14 = call i128 @llvm.ctlz.i128(i128 %13, i1 true)
+ // CHECK-NEXT: %cast5 = trunc i128 %14 to i32
+ // CHECK-NEXT: store volatile i32 %cast5, ptr %lz, align 4
+ lz = __builtin_clzg(uc, sc);
+ // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %16 = call i8 @llvm.ctlz.i8(i8 %15, i1 true)
+ // CHECK-NEXT: %cast6 = sext i8 %16 to i32
+ // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
+ // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
+ // CHECK-NEXT: %conv = sext i8 %17 to i32
+ // CHECK-NEXT: %clzg = select i1 %iszero, i32 %conv, i32 %cast6
+ // CHECK-NEXT: store volatile i32 %clzg, ptr %lz, align 4
+ lz = __builtin_clzg(us, uc);
+ // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %19 = call i16 @llvm.ctlz.i16(i16 %18, i1 true)
+ // CHECK-NEXT: %cast7 = sext i16 %19 to i32
+ // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
+ // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %conv9 = zext i8 %20 to i32
+ // CHECK-NEXT: %clzg10 = select i1 %iszero8, i32 %conv9, i32 %cast7
+ // CHECK-NEXT: store volatile i32 %clzg10, ptr %lz, align 4
+ lz = __builtin_clzg(ui, s);
+ // CHECK-NEXT: %21 = load i32, ptr %ui.addr, align 4
+ // CHECK-NEXT: %22 = call i32 @llvm.ctlz.i32(i32 %21, i1 true)
+ // CHECK-NEXT: %iszero11 = icmp eq i32 %21, 0
+ // CHECK-NEXT: %23 = load i16, ptr %s.addr, align 2
+ // CHECK-NEXT: %conv12 = sext i16 %23 to i32
+ // CHECK-NEXT: %clzg13 = select i1 %iszero11, i32 %conv12, i32 %22
+ // CHECK-NEXT: store volatile i32 %clzg13, ptr %lz, align 4
+ lz = __builtin_clzg(ul, us);
+ // CHECK-NEXT: %24 = load i64, ptr %ul.addr, align 8
+ // CHECK-NEXT: %25 = call i64 @llvm.ctlz.i64(i64 %24, i1 true)
+ // CHECK-NEXT: %cast14 = trunc i64 %25 to i32
+ // CHECK-NEXT: %iszero15 = icmp eq i64 %24, 0
+ // CHECK-NEXT: %26 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %conv16 = zext i16 %26 to i32
+ // CHECK-NEXT: %clzg17 = select i1 %iszero15, i32 %conv16, i32 %cast14
+ // CHECK-NEXT: store volatile i32 %clzg17, ptr %lz, align 4
+ lz = __builtin_clzg(ull, i);
+ // CHECK-NEXT: %27 = load i64, ptr %ull.addr, align 8
+ // CHECK-NEXT: %28 = call i64 @llvm.ctlz.i64(i64 %27, i1 true)
+ // CHECK-NEXT: %cast18 = trunc i64 %28 to i32
+ // CHECK-NEXT: %iszero19 = icmp eq i64 %27, 0
+ // CHECK-NEXT: %29 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %clzg20 = select i1 %iszero19, i32 %29, i32 %cast18
+ // CHECK-NEXT: store volatile i32 %clzg20, ptr %lz, align 4
+ lz = __builtin_clzg(ui128, i);
+ // CHECK-NEXT: %30 = load i128, ptr %ui128.addr, align 16
+ // CHECK-NEXT: %31 = call i128 @llvm.ctlz.i128(i128 %30, i1 true)
+ // CHECK-NEXT: %cast21 = trunc i128 %31 to i32
+ // CHECK-NEXT: %iszero22 = icmp eq i128 %30, 0
+ // CHECK-NEXT: %32 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %clzg23 = select i1 %iszero22, i32 %32, i32 %cast21
+ // CHECK-NEXT: store volatile i32 %clzg23, ptr %lz, align 4
+ lz = __builtin_clzg(ubi128, i);
+ // CHECK-NEXT: %33 = load i128, ptr %ubi128.addr, align 8
+ // CHECK-NEXT: %34 = call i128 @llvm.ctlz.i128(i128 %33, i1 true)
+ // CHECK-NEXT: %cast24 = trunc i128 %34 to i32
+ // CHECK-NEXT: %iszero25 = icmp eq i128 %33, 0
+ // CHECK-NEXT: %35 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %clzg26 = select i1 %iszero25, i32 %35, i32 %cast24
+ // CHECK-NEXT: store volatile i32 %clzg26, ptr %lz, align 4
+ // CHECK-NEXT: ret void
+}
+
+// CHECK-LABEL: define{{.*}} void @test_builtin_ctzg
+void test_builtin_ctzg(unsigned char uc, unsigned short us, unsigned int ui,
+ unsigned long ul, unsigned long long ull,
+ unsigned __int128 ui128, unsigned _BitInt(128) ubi128,
+ signed char sc, short s, int i) {
+ volatile int tz;
+ tz = __builtin_ctzg(uc);
+ // CHECK: %1 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %2 = call i8 @llvm.cttz.i8(i8 %1, i1 true)
+ // CHECK-NEXT: %cast = sext i8 %2 to i32
+ // CHECK-NEXT: store volatile i32 %cast, ptr %tz, align 4
+ tz = __builtin_ctzg(us);
+ // CHECK-NEXT: %3 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %4 = call i16 @llvm.cttz.i16(i16 %3, i1 true)
+ // CHECK-NEXT: %cast1 = sext i16 %4 to i32
+ // CHECK-NEXT: store volatile i32 %cast1, ptr %tz, align 4
+ tz = __builtin_ctzg(ui);
+ // CHECK-NEXT: %5 = load i32, ptr %ui.addr, align 4
+ // CHECK-NEXT: %6 = call i32 @llvm.cttz.i32(i32 %5, i1 true)
+ // CHECK-NEXT: store volatile i32 %6, ptr %tz, align 4
+ tz = __builtin_ctzg(ul);
+ // CHECK-NEXT: %7 = load i64, ptr %ul.addr, align 8
+ // CHECK-NEXT: %8 = call i64 @llvm.cttz.i64(i64 %7, i1 true)
+ // CHECK-NEXT: %cast2 = trunc i64 %8 to i32
+ // CHECK-NEXT: store volatile i32 %cast2, ptr %tz, align 4
+ tz = __builtin_ctzg(ull);
+ // CHECK-NEXT: %9 = load i64, ptr %ull.addr, align 8
+ // CHECK-NEXT: %10 = call i64 @llvm.cttz.i64(i64 %9, i1 true)
+ // CHECK-NEXT: %cast3 = trunc i64 %10 to i32
+ // CHECK-NEXT: store volatile i32 %cast3, ptr %tz, align 4
+ tz = __builtin_ctzg(ui128);
+ // CHECK-NEXT: %11 = load i128, ptr %ui128.addr, align 16
+ // CHECK-NEXT: %12 = call i128 @llvm.cttz.i128(i128 %11, i1 true)
+ // CHECK-NEXT: %cast4 = trunc i128 %12 to i32
+ // CHECK-NEXT: store volatile i32 %cast4, ptr %tz, align 4
+ tz = __builtin_ctzg(ubi128);
+ // CHECK-NEXT: %13 = load i128, ptr %ubi128.addr, align 8
+ // CHECK-NEXT: %14 = call i128 @llvm.cttz.i128(i128 %13, i1 true)
+ // CHECK-NEXT: %cast5 = trunc i128 %14 to i32
+ // CHECK-NEXT: store volatile i32 %cast5, ptr %tz, align 4
+ tz = __builtin_ctzg(uc, sc);
+ // CHECK-NEXT: %15 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %16 = call i8 @llvm.cttz.i8(i8 %15, i1 true)
+ // CHECK-NEXT: %cast6 = sext i8 %16 to i32
+ // CHECK-NEXT: %iszero = icmp eq i8 %15, 0
+ // CHECK-NEXT: %17 = load i8, ptr %sc.addr, align 1
+ // CHECK-NEXT: %conv = sext i8 %17 to i32
+ // CHECK-NEXT: %ctzg = select i1 %iszero, i32 %conv, i32 %cast6
+ // CHECK-NEXT: store volatile i32 %ctzg, ptr %tz, align 4
+ tz = __builtin_ctzg(us, uc);
+ // CHECK-NEXT: %18 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %19 = call i16 @llvm.cttz.i16(i16 %18, i1 true)
+ // CHECK-NEXT: %cast7 = sext i16 %19 to i32
+ // CHECK-NEXT: %iszero8 = icmp eq i16 %18, 0
+ // CHECK-NEXT: %20 = load i8, ptr %uc.addr, align 1
+ // CHECK-NEXT: %conv9 = zext i8 %20 to i32
+ // CHECK-NEXT: %ctzg10 = select i1 %iszero8, i32 %conv9, i32 %cast7
+ // CHECK-NEXT: store volatile i32 %ctzg10, ptr %tz, align 4
+ tz = __builtin_ctzg(ui, s);
+ // CHECK-NEXT: %21 = load i32, ptr %ui.addr, align 4
+ // CHECK-NEXT: %22 = call i32 @llvm.cttz.i32(i32 %21, i1 true)
+ // CHECK-NEXT: %iszero11 = icmp eq i32 %21, 0
+ // CHECK-NEXT: %23 = load i16, ptr %s.addr, align 2
+ // CHECK-NEXT: %conv12 = sext i16 %23 to i32
+ // CHECK-NEXT: %ctzg13 = select i1 %iszero11, i32 %conv12, i32 %22
+ // CHECK-NEXT: store volatile i32 %ctzg13, ptr %tz, align 4
+ tz = __builtin_ctzg(ul, us);
+ // CHECK-NEXT: %24 = load i64, ptr %ul.addr, align 8
+ // CHECK-NEXT: %25 = call i64 @llvm.cttz.i64(i64 %24, i1 true)
+ // CHECK-NEXT: %cast14 = trunc i64 %25 to i32
+ // CHECK-NEXT: %iszero15 = icmp eq i64 %24, 0
+ // CHECK-NEXT: %26 = load i16, ptr %us.addr, align 2
+ // CHECK-NEXT: %conv16 = zext i16 %26 to i32
+ // CHECK-NEXT: %ctzg17 = select i1 %iszero15, i32 %conv16, i32 %cast14
+ // CHECK-NEXT: store volatile i32 %ctzg17, ptr %tz, align 4
+ tz = __builtin_ctzg(ull, i);
+ // CHECK-NEXT: %27 = load i64, ptr %ull.addr, align 8
+ // CHECK-NEXT: %28 = call i64 @llvm.cttz.i64(i64 %27, i1 true)
+ // CHECK-NEXT: %cast18 = trunc i64 %28 to i32
+ // CHECK-NEXT: %iszero19 = icmp eq i64 %27, 0
+ // CHECK-NEXT: %29 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %ctzg20 = select i1 %iszero19, i32 %29, i32 %cast18
+ // CHECK-NEXT: store volatile i32 %ctzg20, ptr %tz, align 4
+ tz = __builtin_ctzg(ui128, i);
+ // CHECK-NEXT: %30 = load i128, ptr %ui128.addr, align 16
+ // CHECK-NEXT: %31 = call i128 @llvm.cttz.i128(i128 %30, i1 true)
+ // CHECK-NEXT: %cast21 = trunc i128 %31 to i32
+ // CHECK-NEXT: %iszero22 = icmp eq i128 %30, 0
+ // CHECK-NEXT: %32 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %ctzg23 = select i1 %iszero22, i32 %32, i32 %cast21
+ // CHECK-NEXT: store volatile i32 %ctzg23, ptr %tz, align 4
+ tz = __builtin_ctzg(ubi128, i);
+ // CHECK-NEXT: %33 = load i128, ptr %ubi128.addr, align 8
+ // CHECK-NEXT: %34 = call i128 @llvm.cttz.i128(i128 %33, i1 true)
+ // CHECK-NEXT: %cast24 = trunc i128 %34 to i32
+ // CHECK-NEXT: %iszero25 = icmp eq i128 %33, 0
+ // CHECK-NEXT: %35 = load i32, ptr %i.addr, align 4
+ // CHECK-NEXT: %ctzg26 = select i1 %iszero25, i32 %35, i32 %cast24
+ // CHECK-NEXT: store volatile i32 %ctzg26, ptr %tz, align 4
+ // CHECK-NEXT: ret void
+}
+
#endif
diff --git a/clang/test/CodeGen/swift-async-call-conv.c b/clang/test/CodeGen/swift-async-call-conv.c
index ce32c22fe809..39511698bbae 100644
--- a/clang/test/CodeGen/swift-async-call-conv.c
+++ b/clang/test/CodeGen/swift-async-call-conv.c
@@ -182,3 +182,19 @@ SWIFTASYNCCALL void async_struct_field_and_methods(int i, S &sref, S *sptr) {
// CPPONLY-LABEL: define{{.*}} swifttailcc void @{{.*}}async_nonleaf_method2
// CPPONLY: musttail call swifttailcc void @{{.*}}async_leaf_method
#endif
+
+// Passing this as an argument requires a coerce-and-expand operation,
+// which requires a temporary. Make sure that cleaning up that temporary
+// doesn't mess around with the musttail handling.
+struct coerce_and_expand {
+ char a,b,c,d;
+};
+struct coerce_and_expand return_coerced(void);
+SWIFTASYNCCALL void take_coerced_async(struct coerce_and_expand);
+
+// CHECK-LABEL: swifttailcc void @{{.*}}test_coerced
+SWIFTASYNCCALL void test_coerced() {
+ // CHECK: musttail call swifttailcc void @{{.*}}take_coerced_async
+ // CHECK-NEXT: ret void
+ return take_coerced_async(return_coerced());
+}
diff --git a/clang/test/CodeGen/ubsan-builtin-checks.c b/clang/test/CodeGen/ubsan-builtin-checks.c
index 2bc32d8df485..c7f6078f903b 100644
--- a/clang/test/CodeGen/ubsan-builtin-checks.c
+++ b/clang/test/CodeGen/ubsan-builtin-checks.c
@@ -23,6 +23,9 @@ void check_ctz(int n) {
// CHECK: call void @__ubsan_handle_invalid_builtin
__builtin_ctzll(n);
+
+ // CHECK: call void @__ubsan_handle_invalid_builtin
+ __builtin_ctzg((unsigned int)n);
}
// CHECK: define{{.*}} void @check_clz
@@ -44,4 +47,7 @@ void check_clz(int n) {
// CHECK: call void @__ubsan_handle_invalid_builtin
__builtin_clzll(n);
+
+ // CHECK: call void @__ubsan_handle_invalid_builtin
+ __builtin_clzg((unsigned int)n);
}
diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp
index 991eb73fe45c..7803ed5b633f 100644
--- a/clang/test/CodeGenCXX/auto-var-init.cpp
+++ b/clang/test/CodeGenCXX/auto-var-init.cpp
@@ -1,8 +1,8 @@
// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK,CHECK-O0
// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,PATTERN,PATTERN-O1
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=PATTERN,PATTERN-O1
// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,ZERO,ZERO-O0
-// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O1,ZERO,ZERO-O1
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown -fblocks -ftrivial-auto-var-init=zero %s -O1 -emit-llvm -o - | FileCheck %s -check-prefixes=ZERO,ZERO-O1
// RUN: %clang_cc1 -std=c++14 -triple i386-unknown-unknown -fblocks -ftrivial-auto-var-init=pattern %s -emit-llvm -o - | FileCheck %s -check-prefixes=CHECK-O0,PATTERN,PATTERN-O0
#pragma clang diagnostic ignored "-Winaccessible-base"
@@ -1303,9 +1303,10 @@ TEST_CUSTOM(semivolatile, semivolatile, { 0x44444444, 0x44444444 });
// CHECK-O0: call void @llvm.memcpy
// CHECK-NOT: !annotation
// CHECK-O0: call void @{{.*}}used{{.*}}%custom)
-// CHECK-O1: store i32 1145324612, ptr %custom, align 4
-// CHECK-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4
-// CHECK-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4
+// PATTERN-O1: store i32 1145324612, ptr %custom, align 4
+// PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 4
+// PATTERN-O1-NEXT: store i32 1145324612, ptr %[[I]], align 4
+// ZERO-O1: store i64 4919131752989213764, ptr %custom, align 8
// CHECK-NOT: !annotation
TEST_UNINIT(semivolatileinit, semivolatileinit);
@@ -1418,7 +1419,8 @@ TEST_CUSTOM(matching, matching, { .f = 0xf00f });
// CHECK-O0: call void @llvm.memcpy
// CHECK-NOT: !annotation
// CHECK-O0: call void @{{.*}}used{{.*}}%custom)
-// CHECK-O1: store float 6.145500e+04, ptr {{.*}}, align 4
+// PATTERN-O1: store float 6.145500e+04, ptr {{.*}}, align 4
+// ZERO-O1: store i32 1198526208, ptr %custom, align 4
// CHECK-NOT: !annotation
TEST_UNINIT(matchingreverse, matchingreverse);
@@ -1445,7 +1447,8 @@ TEST_CUSTOM(matchingreverse, matchingreverse, { .i = 0xf00f });
// CHECK-O0: call void @llvm.memcpy
// CHECK-NOT: !annotation
// CHECK-O0: call void @{{.*}}used{{.*}}%custom)
-// CHECK-O1: store i32 61455, ptr %custom, align 4
+// PATTERN-O1: store i32 61455, ptr %custom, align 4
+// ZERO-O1: store i32 61455, ptr %custom, align 4
// CHECK-NOT: !annotation
TEST_UNINIT(unmatched, unmatched);
@@ -1471,7 +1474,8 @@ TEST_CUSTOM(unmatched, unmatched, { .i = 0x3badbeef });
// CHECK-O0: call void @llvm.memcpy
// CHECK-NOT: !annotation
// CHECK-O0: call void @{{.*}}used{{.*}}%custom)
-// CHECK-O1: store i32 1001242351, ptr {{.*}}, align 4
+// PATTERN-O1: store i32 1001242351, ptr {{.*}}, align 4
+// ZERO-O1: store i32 1001242351, ptr {{.*}}, align 4
// CHECK-NOT: !annotation
TEST_UNINIT(unmatchedreverse, unmatchedreverse);
@@ -1504,9 +1508,7 @@ TEST_CUSTOM(unmatchedreverse, unmatchedreverse, { .c = 42 });
// PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}}
// PATTERN-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 3
// PATTERN-O1-NEXT: store i8 -86, ptr %[[I]], align {{.*}}
-// ZERO-O1: store i8 42, ptr {{.*}}, align 4
-// ZERO-O1-NEXT: %[[I:[^ ]*]] = getelementptr inbounds i8, ptr %custom, i64 1
-// ZERO-O1-NEXT: call void @llvm.memset.{{.*}}({{.*}}, i8 0, i64 3, {{.*}})
+// ZERO-O1: store i32 42, ptr {{.*}}, align 4
TEST_UNINIT(unmatchedfp, unmatchedfp);
// CHECK-LABEL: @test_unmatchedfp_uninit()
@@ -1531,7 +1533,8 @@ TEST_CUSTOM(unmatchedfp, unmatchedfp, { .d = 3.1415926535897932384626433 });
// CHECK-O0: call void @llvm.memcpy
// CHECK-NOT: !annotation
// CHECK-O0: call void @{{.*}}used{{.*}}%custom)
-// CHECK-O1: store double 0x400921FB54442D18, ptr %custom, align 8
+// PATTERN-O1: store double 0x400921FB54442D18, ptr %custom, align 8
+// ZERO-O1: store i64 4614256656552045848, ptr %custom, align 8
// CHECK-NOT: !annotation
TEST_UNINIT(emptyenum, emptyenum);
diff --git a/clang/test/CodeGenCXX/x86_64-vaarg.cpp b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
new file mode 100644
index 000000000000..f0177906a09a
--- /dev/null
+++ b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
@@ -0,0 +1,23 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -x c -o - %s | FileCheck %s
+
+typedef struct { struct {} a; } empty;
+
+// CHECK-LABEL: @{{.*}}empty_record_test
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1
+// CHECK-NEXT: [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_EMPTY]], align 1
+// CHECK-NEXT: store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT: call void @llvm.va_start(ptr [[ARRAYDECAY]])
+// CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RETVAL]], ptr align 1 [[TMP]], i64 {{.*}}, i1 false)
+// CHECK-NEXT: ret void
+empty empty_record_test(int z, ...) {
+ __builtin_va_list list;
+ __builtin_va_start(list, z);
+ return __builtin_va_arg(list, empty);
+}
diff --git a/clang/test/CodeGenOpenCL/amdgpu-printf.cl b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
index 6c84485b66b4..edf6dbf8657c 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-printf.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-printf.cl
@@ -30,14 +30,7 @@ __kernel void test_printf_int(int i) {
// CHECK-NEXT: [[S:%.*]] = alloca [4 x i8], align 1, addrspace(5)
// CHECK-NEXT: store i32 [[I:%.*]], ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[S]]) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT: [[LOC0:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 0
-// CHECK-NEXT: store i8 102, ptr addrspace(5) [[LOC0]], align 1
-// CHECK-NEXT: [[LOC1:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 1
-// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC1]], align 1
-// CHECK-NEXT: [[LOC2:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 2
-// CHECK-NEXT: store i8 111, ptr addrspace(5) [[LOC2]], align 1
-// CHECK-NEXT: [[LOC3:%.*]] = getelementptr i8, ptr addrspace(5) [[S]], i64 3
-// CHECK-NEXT: store i8 0, ptr addrspace(5) [[LOC3]], align 1
+// CHECK-NEXT: call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) align 1 [[S]], ptr addrspace(4) align 1 @__const.test_printf_str_int.s, i64 4, i1 false)
// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(5) [[S]], i64 0, i64 0
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[I_ADDR]], align 4, !tbaa [[TBAA8]]
// CHECK-NEXT: [[CALL:%.*]] = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) noundef @.str.2, ptr addrspace(5) noundef [[ARRAYDECAY]], i32 noundef [[TMP2]]) #[[ATTR4]]
diff --git a/clang/test/Driver/clang-offload-bundler-asserts-on.c b/clang/test/Driver/clang-offload-bundler-asserts-on.c
index 521c8641ff54..eb11d5fbbee4 100644
--- a/clang/test/Driver/clang-offload-bundler-asserts-on.c
+++ b/clang/test/Driver/clang-offload-bundler-asserts-on.c
@@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
// REQUIRES: asserts
-// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
// Generate the file we can bundle.
// RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.o
diff --git a/clang/test/Driver/clang-offload-bundler-standardize.c b/clang/test/Driver/clang-offload-bundler-standardize.c
index 6a24968c30ef..91dc8947aabb 100644
--- a/clang/test/Driver/clang-offload-bundler-standardize.c
+++ b/clang/test/Driver/clang-offload-bundler-standardize.c
@@ -1,6 +1,6 @@
// REQUIRES: x86-registered-target
// REQUIRES: asserts
-// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
// REQUIRES: asserts
// Generate the file we can bundle.
diff --git a/clang/test/Driver/clang-offload-bundler.c b/clang/test/Driver/clang-offload-bundler.c
index f3cd2493e052..a56a5424abf8 100644
--- a/clang/test/Driver/clang-offload-bundler.c
+++ b/clang/test/Driver/clang-offload-bundler.c
@@ -1,5 +1,5 @@
// REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*}}-macosx{{.*}}, target={{.*}}-darwin{{.*}}, target={{.*}}-aix{{.*}}
//
// Generate all the types of files we can bundle.
diff --git a/clang/test/Driver/fat-archive-unbundle-ext.c b/clang/test/Driver/fat-archive-unbundle-ext.c
index b409aa6313b1..e98b872f0c0c 100644
--- a/clang/test/Driver/fat-archive-unbundle-ext.c
+++ b/clang/test/Driver/fat-archive-unbundle-ext.c
@@ -1,5 +1,5 @@
// REQUIRES: x86-registered-target
-// UNSUPPORTED: target={{.*-windows.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}
+// UNSUPPORTED: target={{.*-windows.*}}, target={{.*}}-macosx{{.*}}, target={{.*-darwin.*}}, target={{.*}}-aix{{.*}}
// Generate dummy fat object
// RUN: %clang -O0 -target %itanium_abi_triple %s -c -o %t.host.o
diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
index 24797002b80f..3ba2709ad95c 100644
--- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp
+++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp
@@ -3,6 +3,7 @@
// RUN: rm -rf %t && split-file %s %t && cd %t
// RUN: mkdir -p %t/Inputs/usr/lib/x86_64-linux-gnu
// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.a
// RUN: %clang -print-library-module-manifest-path \
// RUN: -stdlib=libc++ \
@@ -10,13 +11,21 @@
// RUN: --target=x86_64-linux-gnu 2>&1 \
// RUN: | FileCheck libcxx-no-module-json.cpp
-// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/modules.json
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.modules.json
// RUN: %clang -print-library-module-manifest-path \
// RUN: -stdlib=libc++ \
// RUN: -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
// RUN: --target=x86_64-linux-gnu 2>&1 \
// RUN: | FileCheck libcxx.cpp
+// RUN: rm %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so
+// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.a
+// RUN: %clang -print-library-module-manifest-path \
+// RUN: -stdlib=libc++ \
+// RUN: -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
+// RUN: --target=x86_64-linux-gnu 2>&1 \
+// RUN: | FileCheck libcxx-no-shared-lib.cpp
+
// RUN: %clang -print-library-module-manifest-path \
// RUN: -stdlib=libstdc++ \
// RUN: -resource-dir=%t/Inputs/usr/lib/x86_64-linux-gnu \
@@ -29,7 +38,13 @@
//--- libcxx.cpp
-// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}modules.json
+// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}libc++.modules.json
+
+//--- libcxx-no-shared-lib.cpp
+
+// Note this might find a different path depending whether search path
+// contains a different libc++.so.
+// CHECK: {{.*}}libc++.modules.json
//--- libstdcxx.cpp
diff --git a/clang/test/Driver/unsupported-option-gpu.c b/clang/test/Driver/unsupported-option-gpu.c
index f23cb71ebfb0..5618b2cba72e 100644
--- a/clang/test/Driver/unsupported-option-gpu.c
+++ b/clang/test/Driver/unsupported-option-gpu.c
@@ -2,4 +2,5 @@
// DEFINE: %{check} = %clang -### --target=x86_64-linux-gnu -c -mcmodel=medium
// RUN: %{check} -x cuda %s --cuda-path=%S/Inputs/CUDA/usr/local/cuda --offload-arch=sm_60 --no-cuda-version-check -fbasic-block-sections=all
+// RUN: %{check} -x hip %s --offload=spirv64 -nogpulib -nogpuinc
// RUN: %{check} -x hip %s --rocm-path=%S/Inputs/rocm -nogpulib -nogpuinc
diff --git a/clang/test/Format/fail-on-incomplete.cpp b/clang/test/Format/fail-on-incomplete.cpp
new file mode 100644
index 000000000000..ccd77af4d599
--- /dev/null
+++ b/clang/test/Format/fail-on-incomplete.cpp
@@ -0,0 +1,4 @@
+// RUN: not clang-format -style=LLVM -fail-on-incomplete-format %s
+// RUN: clang-format -style=LLVM %s
+
+int a(
diff --git a/clang/test/InstallAPI/diagnostics-cpp.test b/clang/test/InstallAPI/diagnostics-cpp.test
index 658886537507..51cca129ea0a 100644
--- a/clang/test/InstallAPI/diagnostics-cpp.test
+++ b/clang/test/InstallAPI/diagnostics-cpp.test
@@ -21,6 +21,8 @@ CHECK-NEXT: CPP.h:5:7: error: declaration has external linkage, but symbol has i
CHECK-NEXT: CPP.h:6:7: error: dynamic library symbol '(weak-def) Bar::init()' is weak defined, but its declaration is not
CHECK-NEXT: int init();
CHECK-NEXT: ^
+CHECK-NEXT: warning: violations found for arm64
+CHECK-NEXT: error: no declaration found for exported symbol 'int foo<unsigned int>(unsigned int)' in dynamic library
//--- inputs.json.in
{
diff --git a/clang/test/InstallAPI/linker-symbols.test b/clang/test/InstallAPI/linker-symbols.test
new file mode 100644
index 000000000000..1e4ddf9c45d5
--- /dev/null
+++ b/clang/test/InstallAPI/linker-symbols.test
@@ -0,0 +1,440 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+; RUN: yaml2obj %t/MagicSymbols.yaml -o %t/MagicSymbols
+
+; RUN: not clang-installapi -target x86_64-apple-macosx13 \
+; RUN: -install_name \
+; RUN: /System/Library/Frameworks/SpecialLinkerSymbols.framework/Versions/A/SpecialLinkerSymbols \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: %t/inputs.json -o %t/output.tbd \
+; RUN: --verify-mode=ErrorsOnly \
+; RUN: --verify-against=%t/MagicSymbols 2>&1 | FileCheck %s
+
+CHECK: warning: violations found for x86_64
+CHECK: error: no declaration found for exported symbol '$ld$add$os10.4$_symbol2' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$add$os10.5$_symbol2' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$hide$os10.6$_symbol1' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$hide$os10.7$_symbol1' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$weak$os10.5$_symbol3' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$weak$os10.4$_symbol3' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$install_name$os10.4$/System/Library/Frameworks/A.framework/Versions/A/A' in dynamic library
+CHECK: error: no declaration found for exported symbol '$ld$install_name$os10.5$/System/Library/Frameworks/B.framework/Versions/A/B' in dynamic library
+
+;--- MagicSymbols.h
+#ifndef SPECIAL_LINKER_SYMBOLS_H
+#define SPECIAL_LINKER_SYMBOLS_H
+
+extern const int SpecialLinkerSymbolsVersion;
+
+extern int symbol1;
+extern int symbol3;
+
+#endif // SPECIAL_LINKER_SYMBOLS_H
+
+;--- inputs.json.in
+{
+ "headers": [ {
+ "path" : "DSTROOT/MagicSymbols.h",
+ "type" : "project"
+ }
+ ],
+ "version": "3"
+}
+
+;--- MagicSymbols.yaml
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 12
+ sizeofcmds: 952
+ flags: 0x100085
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xBD8
+ size: 0
+ offset: 0xBD8
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: ''
+ - sectname: __const
+ segname: __TEXT
+ addr: 0xBD8
+ size: 4
+ offset: 0xBD8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '07000000'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __DATA
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __data
+ segname: __DATA
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 4D00000009030000
+ - sectname: __common
+ segname: __DATA
+ addr: 0x1008
+ size: 8
+ offset: 0x0
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x1
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 944
+ fileoff: 8192
+ filesize: 944
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 0
+ bind_size: 0
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 8192
+ export_size: 376
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8576
+ nsyms: 12
+ stroff: 8768
+ strsize: 368
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 11
+ iundefsym: 11
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 120
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 65536
+ compatibility_version: 65536
+ Content: '/System/Library/Frameworks/SpecialLinkerSymbols.framework/Versions/A/SpecialLinkerSymbols'
+ ZeroPadBytes: 7
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 4C4C4478-5555-3144-A106-356C3C9DACA3
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 851968
+ sdk: 983040
+ ntools: 1
+ Tools:
+ - tool: 4
+ version: 1245184
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8568
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8576
+ datasize: 0
+LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 11
+ Name: _
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 50
+ Name: SpecialLinkerSymbolsVersion
+ Flags: 0x0
+ Address: 0xBD8
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 55
+ Name: symbol
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 63
+ Name: '3'
+ Flags: 0x0
+ Address: 0x1004
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 68
+ Name: '1'
+ Flags: 0x0
+ Address: 0x1000
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 73
+ Name: '$ld$'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 134
+ Name: 'add$os10.'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 162
+ Name: '4$_symbol2'
+ Flags: 0x0
+ Address: 0x1008
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 167
+ Name: '5$_symbol2'
+ Flags: 0x0
+ Address: 0x1009
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 172
+ Name: 'hide$os10.'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 200
+ Name: '6$_symbol1'
+ Flags: 0x0
+ Address: 0x100A
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 205
+ Name: '7$_symbol1'
+ Flags: 0x0
+ Address: 0x100B
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 210
+ Name: 'weak$os10.'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 238
+ Name: '5$_symbol3'
+ Flags: 0x0
+ Address: 0x100F
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 243
+ Name: '4$_symbol3'
+ Flags: 0x0
+ Address: 0x100E
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 248
+ Name: 'install_name$os10.'
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 362
+ Name: '4$/System/Library/Frameworks/A.framework/Versions/A/A'
+ Flags: 0x0
+ Address: 0x100C
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 367
+ Name: '5$/System/Library/Frameworks/B.framework/Versions/A/B'
+ Flags: 0x0
+ Address: 0x100D
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4104
+ - n_strx: 26
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4105
+ - n_strx: 50
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4106
+ - n_strx: 75
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4107
+ - n_strx: 100
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4108
+ - n_strx: 176
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4109
+ - n_strx: 252
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4110
+ - n_strx: 277
+ n_type: 0xF
+ n_sect: 4
+ n_desc: 0
+ n_value: 4111
+ - n_strx: 302
+ n_type: 0xF
+ n_sect: 2
+ n_desc: 0
+ n_value: 3032
+ - n_strx: 331
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 4096
+ - n_strx: 340
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 4100
+ - n_strx: 349
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - '$ld$add$os10.4$_symbol2'
+ - '$ld$add$os10.5$_symbol2'
+ - '$ld$hide$os10.6$_symbol1'
+ - '$ld$hide$os10.7$_symbol1'
+ - '$ld$install_name$os10.4$/System/Library/Frameworks/A.framework/Versions/A/A'
+ - '$ld$install_name$os10.5$/System/Library/Frameworks/B.framework/Versions/A/B'
+ - '$ld$weak$os10.4$_symbol3'
+ - '$ld$weak$os10.5$_symbol3'
+ - _SpecialLinkerSymbolsVersion
+ - _symbol1
+ - _symbol3
+ - dyld_stub_binder
+ - ''
+ - ''
+...
diff --git a/clang/test/InstallAPI/mismatching-objc-class-symbols.test b/clang/test/InstallAPI/mismatching-objc-class-symbols.test
new file mode 100644
index 000000000000..3b4acf1035ac
--- /dev/null
+++ b/clang/test/InstallAPI/mismatching-objc-class-symbols.test
@@ -0,0 +1,269 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+; RUN: yaml2obj %t/swift-objc-class.yaml -o %t/libswift-objc.dylib
+
+// Try out a dylib that only has 1 symbol for an ObjCClass, with no declarations in the header.
+; RUN: clang-installapi -target arm64-apple-macos14 -dynamiclib \
+; RUN: -install_name tmp.dylib --verify-against=%t/libswift-objc.dylib \
+; RUN: -I%t/usr/include %t/inputs.json -o %t/missing.tbd \
+; RUN: --verify-mode=ErrorsAndWarnings 2>&1 | FileCheck --check-prefix MISSING_DECL %s
+; RUN: llvm-readtapi --compare %t/missing.tbd %t/missing-expected.tbd
+
+// Try out a dylib that only has 1 symbol for an ObjCClass,
+// but a complete ObjCClass decl in the header.
+; RUN: clang-installapi -target arm64-apple-macos14 -dynamiclib \
+; RUN: -install_name tmp.dylib --verify-against=%t/libswift-objc.dylib \
+; RUN: -I%t/usr/include %t/inputs.json -o %t/mismatching.tbd \
+; RUN: --verify-mode=Pedantic -DFULL_DECL 2>&1 | FileCheck --check-prefix MISMATCH_DECL %s
+; RUN: llvm-readtapi -compare %t/mismatching.tbd %t/mismatching-expected.tbd
+
+// Try out a dylib that only has 1 symbol for an ObjCClass, but is represented in the header.
+; RUN: clang-installapi -target arm64-apple-macos14 \
+; RUN: -install_name tmp.dylib --verify-against=%t/libswift-objc.dylib \
+; RUN: -I%t/usr/include %t/inputs.json -o %t/matching.tbd \
+; RUN: --verify-mode=Pedantic \
+; RUN: -DHAS_META_DECL 2>&1 | FileCheck --allow-empty %s
+
+; MISSING_DECL: violations found for arm64
+; MISSING_DECL-NEXT: warning: no declaration was found for exported symbol 'Metaclass of Suggestion' in dynamic library
+
+; MISMATCH_DECL: violations found for arm64-apple-macos14
+; MISMATCH_DECL: warning: declaration has external linkage, but dynamic library doesn't have symbol 'Class of Suggestion'
+
+; CHECK-NOT: error
+; CHECK-NOT: warning
+
+
+;--- usr/include/mismatch.h
+#if HAS_META_DECL
+int metaclass __asm("_OBJC_METACLASS_$_Suggestion");
+#endif
+
+#if FULL_DECL
+@interface Suggestion
+@end
+#endif
+
+;--- inputs.json.in
+{
+ "headers": [ {
+ "path" : "DSTROOT/usr/include/mismatch.h",
+ "type" : "public"
+ }
+ ],
+ "version": "3"
+}
+
+;--- missing-expected.tbd
+--- !tapi-tbd
+tbd-version: 4
+targets: [ arm64-macos ]
+flags: [ not_app_extension_safe ]
+install-name: tmp.dylib
+current-version: 0
+compatibility-version: 0
+...
+
+;--- mismatching-expected.tbd
+--- !tapi-tbd
+tbd-version: 4
+targets: [ arm64-macos ]
+flags: [ not_app_extension_safe ]
+install-name: tmp.dylib
+current-version: 0
+compatibility-version: 0
+exports:
+ - targets: [ arm64-macos ]
+ objc-classes: [ Suggestion ]
+...
+
+;--- swift-objc-class.yaml
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 13
+ sizeofcmds: 752
+ flags: 0x100085
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x330
+ size: 0
+ offset: 0x330
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000000
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: ''
+ - sectname: __const
+ segname: __TEXT
+ addr: 0x330
+ size: 1
+ offset: 0x330
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '61'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 16384
+ vmsize: 416
+ fileoff: 16384
+ filesize: 416
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 0
+ bind_size: 0
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 16384
+ export_size: 40
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 16432
+ nsyms: 2
+ stroff: 16464
+ strsize: 48
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 40
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 0
+ compatibility_version: 0
+ Content: tmp.dylib
+ ZeroPadBytes: 7
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 4C4C4443-5555-3144-A142-97179769CBE0
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 917504
+ sdk: 983040
+ ntools: 1
+ Tools:
+ - tool: 4
+ version: 1245184
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 96
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 197656576
+ compatibility_version: 19660800
+ Content: '/System/Library/Frameworks/Foundation.framework/Versions/C/Foundation'
+ ZeroPadBytes: 3
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 88473600
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 16424
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 16432
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 16512
+ datasize: 288
+LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 32
+ Name: '_OBJC_METACLASS_$_Suggestion'
+ Flags: 0x0
+ Address: 0x330
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 2
+ n_desc: 0
+ n_value: 816
+ - n_strx: 31
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ StringTable:
+ - ' '
+ - '_OBJC_METACLASS_$_Suggestion'
+ - dyld_stub_binder
+ FunctionStarts: [ 0x330 ]
+...
+// Generated from:
+// xcrun -sdk macosx clang tmp.c -dynamiclib -install_name tmp.dylib
+// tmp.c:
+// __attribute__((visibility("default")))
+// const char Meta __asm("_OBJC_METACLASS_$_Suggestion") = 'a';
diff --git a/clang/test/InstallAPI/symbol-flags.test b/clang/test/InstallAPI/symbol-flags.test
new file mode 100644
index 000000000000..3f68afd17e3b
--- /dev/null
+++ b/clang/test/InstallAPI/symbol-flags.test
@@ -0,0 +1,290 @@
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+; RUN: sed -e "s|DSTROOT|%/t|g" %t/inputs.json.in > %t/inputs.json
+
+; RUN: yaml2obj %t/flags.yaml -o %t/SymbolFlags
+
+; RUN: not clang-installapi -x c++ --target=arm64-apple-macos13 \
+; RUN: -install_name /System/Library/Frameworks/SymbolFlags.framework/Versions/A/SymbolFlags \
+; RUN: -current_version 1 -compatibility_version 1 \
+; RUN: %t/inputs.json -o output.tbd \
+; RUN: --verify-against=%t/SymbolFlags \
+; RUN: --verify-mode=ErrorsOnly 2>&1 | FileCheck %s
+
+; CHECK: project.h:2:21: error: declaration '(tlv) val' is thread local, but symbol is not in dynamic library
+; CHECK-NEXT: extern __thread int val;
+; CHECK: project.h:3:13: error: dynamic library symbol '(weak-def) __Z12my_weak_funcv' is weak defined, but its declaration is not
+; CHECK-NEXT: extern void my_weak_func();
+
+;--- project.h
+extern void my_func();
+extern __thread int val;
+extern void my_weak_func();
+
+;--- inputs.json.in
+{
+ "headers": [ {
+ "path" : "DSTROOT/project.h",
+ "type" : "project"
+ }
+ ],
+ "version": "3"
+}
+
+;--- flags.yaml
+--- !mach-o
+FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 912
+ flags: 0x118085
+ reserved: 0x0
+LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 232
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 2
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xFB0
+ size: 8
+ offset: 0xFB0
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: C0035FD6C0035FD6
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFB8
+ size: 4152
+ offset: 0xFB8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000010000002000000000000000200000000200000000000002B00F00003800000038000000B80F00000000000038000000030000000C0001001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 0
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 0
+ Sections:
+ - sectname: __common
+ segname: __DATA
+ addr: 0x4000
+ size: 4
+ offset: 0x0
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x1
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 480
+ fileoff: 16384
+ filesize: 480
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_DYLD_INFO_ONLY
+ cmdsize: 48
+ rebase_off: 0
+ rebase_size: 0
+ bind_off: 0
+ bind_size: 0
+ weak_bind_off: 0
+ weak_bind_size: 0
+ lazy_bind_off: 0
+ lazy_bind_size: 0
+ export_off: 16384
+ export_size: 64
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 16456
+ nsyms: 4
+ stroff: 16520
+ strsize: 56
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 3
+ iundefsym: 3
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 0
+ nindirectsyms: 0
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 96
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 65536
+ compatibility_version: 65536
+ Content: '/System/Library/Frameworks/SymbolFlags.framework/Versions/A/SymbolFlags'
+ ZeroPadBytes: 1
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 4C4C4436-5555-3144-A1AF-5D3063ACFC99
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 851968
+ sdk: 983040
+ ntools: 1
+ Tools:
+ - tool: 4
+ version: 1245184
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 117985024
+ compatibility_version: 65536
+ Content: '/usr/lib/libc++.1.dylib'
+ ZeroPadBytes: 1
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 0
+ current_version: 88473600
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 16448
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 16456
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 16576
+ datasize: 288
+LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 0
+ NodeOffset: 5
+ Name: _
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 4
+ NodeOffset: 16
+ Name: val
+ Flags: 0x0
+ Address: 0x4000
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 0
+ NodeOffset: 22
+ Name: _Z
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 52
+ Name: 7my_funcv
+ Flags: 0x0
+ Address: 0xFB0
+ Other: 0x0
+ ImportName: ''
+ - TerminalSize: 3
+ NodeOffset: 57
+ Name: 12my_weak_funcv
+ Flags: 0x4
+ Address: 0xFB4
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 4016
+ - n_strx: 15
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 128
+ n_value: 4020
+ - n_strx: 34
+ n_type: 0xF
+ n_sect: 3
+ n_desc: 0
+ n_value: 16384
+ - n_strx: 39
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ StringTable:
+ - ' '
+ - __Z7my_funcv
+ - __Z12my_weak_funcv
+ - _val
+ - dyld_stub_binder
+ FunctionStarts: [ 0xFB0, 0xFB4 ]
+...
+
+// Generated from:
+// clang++ -mtargetos=macosx13 -arch arm64 flags.cpp
+// flags.cpp:
+// __attribute__((visibility("default"))) void my_func() {}
+// __attribute__((weak)) void my_weak_func() {}
+// int val = 0;
diff --git a/clang/test/Lexer/has_extension_cxx.cpp b/clang/test/Lexer/has_extension_cxx.cpp
index 7941997428ac..7366029d3727 100644
--- a/clang/test/Lexer/has_extension_cxx.cpp
+++ b/clang/test/Lexer/has_extension_cxx.cpp
@@ -33,6 +33,11 @@ int has_deleted_functions();
int has_inline_namespaces();
#endif
+// CHECK: has_lambdas
+#if __has_extension(cxx_lambdas)
+int has_lambdas();
+#endif
+
// CHECK: has_override_control
#if __has_extension(cxx_override_control)
int has_override_control();
diff --git a/clang/test/OpenMP/bug54082.c b/clang/test/OpenMP/bug54082.c
index b88b68fd4301..337c120983e0 100644
--- a/clang/test/OpenMP/bug54082.c
+++ b/clang/test/OpenMP/bug54082.c
@@ -69,9 +69,7 @@ void foo() {
// CHECK-NEXT: [[X_TRAITS:%.*]] = alloca [1 x %struct.omp_alloctrait_t], align 16
// CHECK-NEXT: [[X_ALLOC:%.*]] = alloca i64, align 8
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[X_TRAITS]]) #[[ATTR5:[0-9]+]]
-// CHECK-NEXT: store i32 2, ptr [[X_TRAITS]], align 16
-// CHECK-NEXT: [[LOC0:%.*]] = getelementptr inbounds i8, ptr [[X_TRAITS]], i64 8
-// CHECK-NEXT: store i64 64, ptr [[LOC0]], align 8
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) [[X_TRAITS]], ptr noundef nonnull align 16 dereferenceable(16) @__const.foo.x_traits, i64 16, i1 false)
// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[X_ALLOC]]) #[[ATTR5]]
// CHECK-NEXT: [[CALL:%.*]] = call i64 @omp_init_allocator(i64 noundef 0, i32 noundef 1, ptr noundef nonnull [[X_TRAITS]]) #[[ATTR5]]
// CHECK-NEXT: store i64 [[CALL]], ptr [[X_ALLOC]], align 8, !tbaa [[TBAA3:![0-9]+]]
diff --git a/clang/test/OpenMP/declare_reduction_messages.cpp b/clang/test/OpenMP/declare_reduction_messages.cpp
index 38a5d766eead..752cc4fb05a1 100644
--- a/clang/test/OpenMP/declare_reduction_messages.cpp
+++ b/clang/test/OpenMP/declare_reduction_messages.cpp
@@ -58,16 +58,10 @@ class Class2 : public Class1<T> {
#pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer // expected-error {{expected '(' after 'initializer'}}
#pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer { // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[
-#if __cplusplus <= 199711L
-// expected-error@-2 {{expected '(' after 'initializer'}}
-// expected-error@-3 {{expected expression}}
-// expected-warning@-4 {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
-#else
-// expected-error@-6 {{expected '(' after 'initializer'}}
-// expected-error@-7 {{expected variable name or 'this' in lambda capture list}}
-// expected-error@-8 {{expected ')'}}
-// expected-note@-9 {{to match this '('}}
-#endif
+// expected-error@-1 {{expected '(' after 'initializer'}}
+// expected-error@-2 {{expected variable name or 'this' in lambda capture list}}
+// expected-error@-3 {{expected ')'}}
+// expected-note@-4 {{to match this '('}}
#pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer() // expected-error {{expected expression}}
#pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp) // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}}
#pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig // expected-error {{expected ')'}} expected-note {{to match this '('}}
diff --git a/clang/test/OpenMP/openmp_check.cpp b/clang/test/OpenMP/openmp_check.cpp
index 6a8dd17fc836..b52ce0c06692 100644
--- a/clang/test/OpenMP/openmp_check.cpp
+++ b/clang/test/OpenMP/openmp_check.cpp
@@ -18,7 +18,7 @@ int nested(int a) {
auto F = [&]() {
#if __cplusplus <= 199711L
// expected-warning@-2 {{'auto' type specifier is a C++11 extension}}
- // expected-error@-3 {{expected expression}}
+ // expected-warning@-3 {{lambdas are a C++11 extension}}
#endif
#pragma omp parallel
diff --git a/clang/test/Parser/cxx03-lambda-extension.cpp b/clang/test/Parser/cxx03-lambda-extension.cpp
new file mode 100644
index 000000000000..82ae7da30530
--- /dev/null
+++ b/clang/test/Parser/cxx03-lambda-extension.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++03 %s
+
+void func() {
+ []() {}; // expected-warning {{lambdas are a C++11 extension}}
+}
diff --git a/clang/test/Parser/cxx0x-lambda-expressions.cpp b/clang/test/Parser/cxx0x-lambda-expressions.cpp
index 72b315a497c0..a786a964163e 100644
--- a/clang/test/Parser/cxx0x-lambda-expressions.cpp
+++ b/clang/test/Parser/cxx0x-lambda-expressions.cpp
@@ -1,10 +1,15 @@
-// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++11 -Wno-c99-designator %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++20 -Wno-c99-designator %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++23 -Wno-c99-designator %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++03 -Wno-c99-designator %s -Wno-c++11-extensions
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx14ext,cxx17ext,cxx20ext,cxx23ext -std=c++11 -Wno-c99-designator %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx17ext,cxx20ext,cxx23ext -std=c++14 -Wno-c99-designator %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx20ext,cxx23ext -std=c++17 -Wno-c99-designator %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected,cxx23ext -std=c++20 -Wno-c99-designator %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=expected -std=c++23 -Wno-c99-designator %s
enum E { e };
+#if __cplusplus >= 201103L
constexpr int id(int n) { return n; }
+#endif
class C {
@@ -19,28 +24,25 @@ class C {
[&,] {}; // expected-error {{expected variable name or 'this' in lambda capture list}}
[=,] {}; // expected-error {{expected variable name or 'this' in lambda capture list}}
[] {};
- [=] (int i) {};
- [&] (int) mutable -> void {};
- [foo,bar] () { return 3; };
- [=,&foo] () {};
- [&,foo] () {};
- [this] () {};
+ [=] (int i) {};
+ [&] (int) mutable -> void {};
+ [foo,bar] () { return 3; };
+ [=,&foo] () {};
+ [&,foo] () {};
+ [this] () {};
[] () -> class C { return C(); };
[] () -> enum E { return e; };
- [] -> int { return 0; };
- [] mutable -> int { return 0; };
-#if __cplusplus <= 202002L
- // expected-warning@-3 {{lambda without a parameter clause is a C++23 extension}}
- // expected-warning@-3 {{is a C++23 extension}}
-#endif
+ [] -> int { return 0; }; // cxx23ext-warning {{lambda without a parameter clause is a C++23 extension}}
+ [] mutable -> int { return 0; }; // cxx23ext-warning {{is a C++23 extension}}
+
[](int) -> {}; // PR13652 expected-error {{expected a type}}
return 1;
}
void designator_or_lambda() {
- typedef int T;
- const int b = 0;
+ typedef int T;
+ const int b = 0;
const int c = 1;
int d;
int a1[1] = {[b] (T()) {}}; // expected-error{{no viable conversion from '(lambda}}
@@ -49,19 +51,18 @@ class C {
int a4[1] = {[&b] = 1 }; // expected-error{{integral constant expression must have integral or unscoped enumeration type, not 'const int *'}}
int a5[3] = { []{return 0;}() };
int a6[1] = {[this] = 1 }; // expected-error{{integral constant expression must have integral or unscoped enumeration type, not 'C *'}}
- int a7[1] = {[d(0)] { return d; } ()};
- int a8[1] = {[d = 0] { return d; } ()};
- int a10[1] = {[id(0)] { return id; } ()};
-#if __cplusplus <= 201103L
- // expected-warning@-4{{extension}}
- // expected-warning@-4{{extension}}
- // expected-warning@-4{{extension}}
+ int a7[1] = {[d(0)] { return d; } ()}; // cxx14ext-warning {{initialized lambda captures are a C++14 extension}}
+ int a8[1] = {[d = 0] { return d; } ()}; // cxx14ext-warning {{initialized lambda captures are a C++14 extension}}
+#if __cplusplus >= 201103L
+ int a10[1] = {[id(0)] { return id; } ()}; // cxx14ext-warning {{initialized lambda captures are a C++14 extension}}
#endif
int a9[1] = {[d = 0] = 1}; // expected-error{{is not an integral constant expression}}
#if __cplusplus >= 201402L
// expected-note@-2{{constant expression cannot modify an object that is visible outside that expression}}
#endif
+#if __cplusplus >= 201103L
int a11[1] = {[id(0)] = 1};
+#endif
}
void delete_lambda(int *p) {
@@ -80,43 +81,33 @@ class C {
// We support init-captures in C++11 as an extension.
int z;
void init_capture() {
- [n(0)] () mutable -> int { return ++n; };
- [n{0}] { return; };
- [a([&b = z]{})](){};
- [n = 0] { return ++n; }; // expected-error {{captured by copy in a non-mutable}}
- [n = {0}] { return; }; // expected-error {{<initializer_list>}}
-#if __cplusplus <= 201103L
- // expected-warning@-6{{extension}}
- // expected-warning@-6{{extension}}
- // expected-warning@-6{{extension}}
- // expected-warning@-7{{extension}}
- // expected-warning@-7{{extension}}
- // expected-warning@-7{{extension}}
-#endif
+ [n(0)] () mutable -> int { return ++n; }; // cxx14ext-warning {{initialized lambda captures are a C++14 extension}}
+ [n{0}] { return; }; // cxx14ext-warning {{initialized lambda captures are a C++14 extension}}
+ [a([&b = z]{})](){}; // cxx14ext-warning 2 {{initialized lambda captures are a C++14 extension}}
+ [n = 0] { return ++n; }; // expected-error {{captured by copy in a non-mutable}}
+ // cxx14ext-warning@-1 {{initialized lambda captures are a C++14 extension}}
+ [n = {0}] { return; }; // expected-error {{<initializer_list>}}
+ // cxx14ext-warning@-1 {{initialized lambda captures are a C++14 extension}}
int x = 4;
- auto y = [&r = x, x = x + 1]() -> int {
-#if __cplusplus <= 201103L
- // expected-warning@-2{{extension}}
- // expected-warning@-3{{extension}}
-#endif
+ auto y = [&r = x, x = x + 1]() -> int { // cxx14ext-warning 2 {{initialized lambda captures are a C++14 extension}}
r += 2;
return x + 2;
} ();
}
void attributes() {
- [] __attribute__((noreturn)){};
-#if __cplusplus <= 202002L
- // expected-warning@-2 {{is a C++23 extension}}
-#endif
+ [] __attribute__((noreturn)){}; // cxx23ext-warning {{lambda without a parameter clause is a C++23 extension}}
+
[]() [[]]
mutable {}; // expected-error {{expected body of lambda expression}}
[]() [[]] {};
[]() [[]] -> void {};
[]() mutable [[]] -> void {};
+#if __cplusplus >= 201103L
[]() mutable noexcept [[]] -> void {};
+#endif
// Testing GNU-style attributes on lambdas -- the attribute is specified
// before the mutable specifier instead of after (unlike C++11).
@@ -126,28 +117,18 @@ class C {
// Testing support for P2173 on adding attributes to the declaration
// rather than the type.
- [][[]](){};
-#if __cplusplus <= 202002L
- // expected-warning@-2 {{an attribute specifier sequence in this position is a C++23 extension}}
-#endif
-#if __cplusplus > 201703L
- []<typename>[[]](){};
-#if __cplusplus <= 202002L
- // expected-warning@-2 {{an attribute specifier sequence in this position is a C++23 extension}}
-#endif
-#endif
- [][[]]{};
-#if __cplusplus <= 202002L
- // expected-warning@-2 {{an attribute specifier sequence in this position is a C++23 extension}}
-#endif
+ [][[]](){}; // cxx23ext-warning {{an attribute specifier sequence in this position is a C++23 extension}}
+
+ []<typename>[[]](){}; // cxx20ext-warning {{explicit template parameter list for lambdas is a C++20 extension}}
+ // cxx23ext-warning@-1 {{an attribute specifier sequence in this position is a C++23 extension}}
+
+ [][[]]{}; // cxx23ext-warning {{an attribute specifier sequence in this position is a C++23 extension}}
}
void missing_parens() {
- [] mutable {};
- [] noexcept {};
-#if __cplusplus <= 202002L
- // expected-warning@-3 {{is a C++23 extension}}
- // expected-warning@-3 {{is a C++23 extension}}
+ [] mutable {}; // cxx23ext-warning {{is a C++23 extension}}
+#if __cplusplus >= 201103L
+ [] noexcept {}; // cxx23ext-warning {{is a C++23 extension}}
#endif
}
};
@@ -165,10 +146,7 @@ struct A {
};
struct S {
- void mf() { A{[*this]{}}; }
-#if __cplusplus < 201703L
- // expected-warning@-2 {{C++17 extension}}
-#endif
+ void mf() { A(([*this]{})); } // cxx17ext-warning {{'*this' by copy is a C++17 extension}}
};
}
diff --git a/clang/test/Parser/cxx2b-lambdas.cpp b/clang/test/Parser/cxx2b-lambdas.cpp
index ad975a17b6e4..758ec9a42f56 100644
--- a/clang/test/Parser/cxx2b-lambdas.cpp
+++ b/clang/test/Parser/cxx2b-lambdas.cpp
@@ -1,30 +1,48 @@
+// RUN: %clang_cc1 -std=c++03 %s -verify -Wno-c++23-extensions -Wno-c++20-extensions -Wno-c++17-extensions -Wno-c++14-extensions -Wno-c++11-extensions
+// RUN: %clang_cc1 -std=c++11 %s -verify=expected,cxx11 -Wno-c++23-extensions -Wno-c++20-extensions -Wno-c++17-extensions -Wno-c++14-extensions
+// RUN: %clang_cc1 -std=c++14 %s -verify -Wno-c++23-extensions -Wno-c++20-extensions -Wno-c++17-extensions
+// RUN: %clang_cc1 -std=c++17 %s -verify -Wno-c++23-extensions -Wno-c++20-extensions
+// RUN: %clang_cc1 -std=c++20 %s -verify -Wno-c++23-extensions
// RUN: %clang_cc1 -std=c++23 %s -verify
auto LL0 = [] {};
auto LL1 = []() {};
auto LL2 = []() mutable {};
-auto LL3 = []() constexpr {};
+#if __cplusplus >= 201103L
+auto LL3 = []() constexpr {}; // cxx11-error {{return type 'void' is not a literal type}}
+#endif
-auto L0 = [] constexpr {};
+#if __cplusplus >= 201103L
+auto L0 = [] constexpr {}; // cxx11-error {{return type 'void' is not a literal type}}
+#endif
auto L1 = [] mutable {};
+#if __cplusplus >= 201103L
auto L2 = [] noexcept {};
-auto L3 = [] constexpr mutable {};
-auto L4 = [] mutable constexpr {};
-auto L5 = [] constexpr mutable noexcept {};
+auto L3 = [] constexpr mutable {}; // cxx11-error {{return type 'void' is not a literal type}}
+auto L4 = [] mutable constexpr {}; // cxx11-error {{return type 'void' is not a literal type}}
+auto L5 = [] constexpr mutable noexcept {}; // cxx11-error {{return type 'void' is not a literal type}}
+#endif
auto L6 = [s = 1] mutable {};
-auto L7 = [s = 1] constexpr mutable noexcept {};
+#if __cplusplus >= 201103L
+auto L7 = [s = 1] constexpr mutable noexcept {}; // cxx11-error {{return type 'void' is not a literal type}}
+#endif
auto L8 = [] -> bool { return true; };
auto L9 = []<typename T> { return true; };
+#if __cplusplus >= 201103L
auto L10 = []<typename T> noexcept { return true; };
+#endif
auto L11 = []<typename T> -> bool { return true; };
+#if __cplusplus >= 202002L
auto L12 = [] consteval {};
auto L13 = []() requires true {}; // expected-error{{non-templated function cannot have a requires clause}}
auto L14 = []<auto> requires true() requires true {};
auto L15 = []<auto> requires true noexcept {};
+#endif
auto L16 = [] [[maybe_unused]]{};
-auto XL0 = [] mutable constexpr mutable {}; // expected-error{{cannot appear multiple times}}
-auto XL1 = [] constexpr mutable constexpr {}; // expected-error{{cannot appear multiple times}}
+#if __cplusplus >= 201103L
+auto XL0 = [] mutable constexpr mutable {}; // expected-error{{cannot appear multiple times}} cxx11-error {{return type 'void' is not a literal type}}
+auto XL1 = [] constexpr mutable constexpr {}; // expected-error{{cannot appear multiple times}} cxx11-error {{return type 'void' is not a literal type}}
auto XL2 = []) constexpr mutable constexpr {}; // expected-error{{expected body of lambda expression}}
auto XL3 = []( constexpr mutable constexpr {}; // expected-error{{invalid storage class specifier}} \
// expected-error{{function parameter cannot be constexpr}} \
@@ -33,16 +51,23 @@ auto XL3 = []( constexpr mutable constexpr {}; // expected-error{{invalid storag
// expected-note{{to match this '('}} \
// expected-error{{expected body}} \
// expected-warning{{duplicate 'constexpr'}}
+#endif
// http://llvm.org/PR49736
auto XL4 = [] requires true {}; // expected-error{{expected body}}
+#if __cplusplus >= 201703L
auto XL5 = []<auto> requires true requires true {}; // expected-error{{expected body}}
auto XL6 = []<auto> requires true noexcept requires true {}; // expected-error{{expected body}}
+#endif
auto XL7 = []() static static {}; // expected-error {{cannot appear multiple times}}
auto XL8 = []() static mutable {}; // expected-error {{cannot be both mutable and static}}
+#if __cplusplus >= 202002L
auto XL9 = []() static consteval {};
-auto XL10 = []() static constexpr {};
+#endif
+#if __cplusplus >= 201103L
+auto XL10 = []() static constexpr {}; // cxx11-error {{return type 'void' is not a literal type}}
+#endif
auto XL11 = [] static {};
auto XL12 = []() static {};
@@ -67,6 +92,7 @@ void static_captures() {
};
}
+#if __cplusplus >= 201703L
constexpr auto static_capture_constexpr() {
char n = 'n';
return [n] static { return n; }(); // expected-error {{a static lambda cannot have any captures}}
@@ -78,3 +104,4 @@ constexpr auto capture_constexpr() {
return [n] { return n; }();
}
static_assert(capture_constexpr());
+#endif
diff --git a/clang/test/Parser/objcxx-lambda-expressions-neg.mm b/clang/test/Parser/objcxx-lambda-expressions-neg.mm
index b2fe39dfbf70..795157816dcf 100644
--- a/clang/test/Parser/objcxx-lambda-expressions-neg.mm
+++ b/clang/test/Parser/objcxx-lambda-expressions-neg.mm
@@ -1,13 +1,8 @@
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify %s
-// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify=cxx03 -std=c++98 %s
// RUN: %clang_cc1 -fsyntax-only -Wno-unused-value -verify -std=c++11 %s
int main() {
- []{};
-#if __cplusplus <= 199711L
- // expected-error@-2 {{expected expression}}
-#else
+ []{}; // cxx03-warning {{lambdas are a C++11 extension}}
// expected-no-diagnostics
-#endif
-
}
diff --git a/clang/test/ParserHLSL/group_shared.hlsl b/clang/test/ParserHLSL/group_shared.hlsl
index 0b9f28395ee4..44f3a2e5b450 100644
--- a/clang/test/ParserHLSL/group_shared.hlsl
+++ b/clang/test/ParserHLSL/group_shared.hlsl
@@ -3,8 +3,8 @@ extern groupshared float f;
extern float groupshared f; // Ok, redeclaration?
-// NOTE:lambda is not enabled except for hlsl202x.
-// expected-error@+2 {{expected expression}}
+// expected-warning@+3 {{lambdas are a C++11 extension}}
+// expected-error@+2 {{expected body of lambda expression}}
// expected-warning@+1 {{'auto' type specifier is a C++11 extension}}
auto l = []() groupshared {};
diff --git a/clang/test/Sema/builtin-popcountg.c b/clang/test/Sema/builtin-popcountg.c
deleted file mode 100644
index 9d095927d24e..000000000000
--- a/clang/test/Sema/builtin-popcountg.c
+++ /dev/null
@@ -1,23 +0,0 @@
-// RUN: %clang_cc1 -std=c23 -triple=x86_64-pc-linux-gnu -fsyntax-only -verify -Wpedantic %s
-
-typedef int int2 __attribute__((ext_vector_type(2)));
-
-void test_builtin_popcountg(short s, int i, __int128 i128, _BitInt(128) bi128,
- double d, int2 i2) {
- __builtin_popcountg();
- // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
- __builtin_popcountg(i, i);
- // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
- __builtin_popcountg(s);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was 'short')}}
- __builtin_popcountg(i);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was 'int')}}
- __builtin_popcountg(i128);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was '__int128')}}
- __builtin_popcountg(bi128);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was '_BitInt(128)')}}
- __builtin_popcountg(d);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was 'double')}}
- __builtin_popcountg(i2);
- // expected-error@-1 {{1st argument must be a type of unsigned integer (was 'int2' (vector of 2 'int' values))}}
-}
diff --git a/clang/test/Sema/count-builtins.c b/clang/test/Sema/count-builtins.c
new file mode 100644
index 000000000000..79fa812f3f20
--- /dev/null
+++ b/clang/test/Sema/count-builtins.c
@@ -0,0 +1,87 @@
+// RUN: %clang_cc1 -std=c23 -triple=x86_64-pc-linux-gnu -fsyntax-only -verify -Wpedantic %s
+
+typedef int int2 __attribute__((ext_vector_type(2)));
+
+void test_builtin_popcountg(short s, int i, __int128 i128, _BitInt(128) bi128,
+ double d, int2 i2) {
+ __builtin_popcountg();
+ // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+ __builtin_popcountg(i, i);
+ // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+ __builtin_popcountg(s);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'short')}}
+ __builtin_popcountg(i);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int')}}
+ __builtin_popcountg(i128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '__int128')}}
+ __builtin_popcountg(bi128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '_BitInt(128)')}}
+ __builtin_popcountg(d);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'double')}}
+ __builtin_popcountg(i2);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int2' (vector of 2 'int' values))}}
+}
+
+void test_builtin_clzg(short s, int i, unsigned int ui, __int128 i128,
+ _BitInt(128) bi128, double d, int2 i2) {
+ __builtin_clzg();
+ // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+ __builtin_clzg(i, i, i);
+ // expected-error@-1 {{too many arguments to function call, expected at most 2, have 3}}
+ __builtin_clzg(s);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'short')}}
+ __builtin_clzg(i);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int')}}
+ __builtin_clzg(i128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '__int128')}}
+ __builtin_clzg(bi128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '_BitInt(128)')}}
+ __builtin_clzg(d);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'double')}}
+ __builtin_clzg(i2);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int2' (vector of 2 'int' values))}}
+ __builtin_clzg(i2);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int2' (vector of 2 'int' values))}}
+ __builtin_clzg(ui, ui);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'unsigned int')}}
+ __builtin_clzg(ui, i128);
+ // expected-error@-1 {{2nd argument must be an 'int' (was '__int128')}}
+ __builtin_clzg(ui, bi128);
+ // expected-error@-1 {{2nd argument must be an 'int' (was '_BitInt(128)')}}
+ __builtin_clzg(ui, d);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'double')}}
+ __builtin_clzg(ui, i2);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'int2' (vector of 2 'int' values))}}
+}
+
+void test_builtin_ctzg(short s, int i, unsigned int ui, __int128 i128,
+ _BitInt(128) bi128, double d, int2 i2) {
+ __builtin_ctzg();
+ // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+ __builtin_ctzg(i, i, i);
+ // expected-error@-1 {{too many arguments to function call, expected at most 2, have 3}}
+ __builtin_ctzg(s);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'short')}}
+ __builtin_ctzg(i);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int')}}
+ __builtin_ctzg(i128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '__int128')}}
+ __builtin_ctzg(bi128);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was '_BitInt(128)')}}
+ __builtin_ctzg(d);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'double')}}
+ __builtin_ctzg(i2);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int2' (vector of 2 'int' values))}}
+ __builtin_ctzg(i2);
+ // expected-error@-1 {{1st argument must be an unsigned integer (was 'int2' (vector of 2 'int' values))}}
+ __builtin_ctzg(ui, ui);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'unsigned int')}}
+ __builtin_ctzg(ui, i128);
+ // expected-error@-1 {{2nd argument must be an 'int' (was '__int128')}}
+ __builtin_ctzg(ui, bi128);
+ // expected-error@-1 {{2nd argument must be an 'int' (was '_BitInt(128)')}}
+ __builtin_ctzg(ui, d);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'double')}}
+ __builtin_ctzg(ui, i2);
+ // expected-error@-1 {{2nd argument must be an 'int' (was 'int2' (vector of 2 'int' values))}}
+}
diff --git a/clang/test/Sema/warn-cast-function-type-strict.c b/clang/test/Sema/warn-cast-function-type-strict.c
index 8c88f275d2b3..b0a70cf324b7 100644
--- a/clang/test/Sema/warn-cast-function-type-strict.c
+++ b/clang/test/Sema/warn-cast-function-type-strict.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -fsyntax-only -Wcast-function-type -verify
-// RUN: %clang_cc1 %s -fsyntax-only -Wcast-function-type-strict -verify
+// RUN: %clang_cc1 %s -fsyntax-only -Wcast-function-type -verify=expected,strict
+// RUN: %clang_cc1 %s -fsyntax-only -Wcast-function-type-strict -verify=expected,strict
// RUN: %clang_cc1 %s -fsyntax-only -Wextra -Wno-ignored-qualifiers -verify
int t(int array[static 12]);
@@ -32,13 +32,13 @@ f10 *j;
void foo(void) {
a = (f1 *)x;
b = (f2 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f2 *' (aka 'int (*)(void *)') converts to incompatible function type}} */
- c = (f3 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f3 *' (aka 'int (*)()') converts to incompatible function type}} */
+ c = (f3 *)x; /* strict-warning {{cast from 'int (*)(long)' to 'f3 *' (aka 'int (*)()') converts to incompatible function type}} */
d = (f4 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f4 *' (aka 'void (*)()') converts to incompatible function type}} */
- e = (f5 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f5 *' (aka 'void (*)(void)') converts to incompatible function type}} */
+ e = (f5 *)x; /* strict-warning {{cast from 'int (*)(long)' to 'f5 *' (aka 'void (*)(void)') converts to incompatible function type}} */
f = (f6 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f6 *' (aka 'int (*)(long, int)') converts to incompatible function type}} */
- g = (f7 *)x; /* expected-warning {{cast from 'int (*)(long)' to 'f7 *' (aka 'int (*)(long, ...)') converts to incompatible function type}} */
+ g = (f7 *)x; /* strict-warning {{cast from 'int (*)(long)' to 'f7 *' (aka 'int (*)(long, ...)') converts to incompatible function type}} */
h = (f8 *)t;
i = (f9 *)u;
// FIXME: return type qualifier should not be included in the function type . Warning should be absent after this issue is fixed. https://github.com/llvm/llvm-project/issues/39494 .
- j = (f10 *)v; /* expected-warning {{cast from 'const int (*)(int)' to 'f10 *' (aka 'int (*)(int)') converts to incompatible function type}} */
+ j = (f10 *)v; /* strict-warning {{cast from 'const int (*)(int)' to 'f10 *' (aka 'int (*)(int)') converts to incompatible function type}} */
}
diff --git a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp
index 7ac481369891..fff524e77d3b 100644
--- a/clang/test/SemaCXX/cxx2a-template-lambdas.cpp
+++ b/clang/test/SemaCXX/cxx2a-template-lambdas.cpp
@@ -1,10 +1,14 @@
-// RUN: %clang_cc1 -std=c++2a -verify %s
+// RUN: %clang_cc1 -std=c++03 -verify -Dstatic_assert=_Static_assert -Wno-c++11-extensions -Wno-c++14-extensions -Wno-c++17-extensions -Wno-c++20-extensions %s
+// RUN: %clang_cc1 -std=c++11 -verify=expected,cxx11,cxx11-cxx14 -Wno-c++20-extensions -Wno-c++17-extensions -Wno-c++14-extensions %s
+// RUN: %clang_cc1 -std=c++14 -verify=expected,cxx11-cxx14,cxx14 -Wno-c++20-extensions -Wno-c++17-extensions %s
+// RUN: %clang_cc1 -std=c++17 -verify -Wno-c++20-extensions %s
+// RUN: %clang_cc1 -std=c++20 -verify %s
template<typename, typename>
-constexpr bool is_same = false;
+inline const bool is_same = false;
template<typename T>
-constexpr bool is_same<T, T> = true;
+inline const bool is_same<T, T> = true;
template<typename T>
struct DummyTemplate { };
@@ -23,7 +27,7 @@ void func() {
L1.operator()<6>(); // expected-note {{in instantiation}}
auto L2 = []<template<typename> class T, class U>(T<U> &&arg) {
- static_assert(is_same<T<U>, DummyTemplate<float>>); // // expected-error {{static assertion failed}}
+ static_assert(is_same<T<U>, DummyTemplate<float> >); // // expected-error {{static assertion failed}}
};
L2(DummyTemplate<float>());
L2(DummyTemplate<double>()); // expected-note {{in instantiation}}
@@ -36,15 +40,20 @@ struct ShadowMe {
}
};
+#if __cplusplus >= 201102L
template<typename T>
constexpr T outer() {
- return []<T x>() { return x; }.template operator()<123>(); // expected-error {{no matching member function}} \
- expected-note {{candidate template ignored}}
+ // FIXME: The C++11 error seems wrong
+ return []<T x>() { return x; }.template operator()<123>(); // expected-error {{no matching member function}} \
+ expected-note {{candidate template ignored}} \
+ cxx11-note {{non-literal type '<dependent type>' cannot be used in a constant expression}} \
+ cxx14-note {{non-literal type}}
}
-static_assert(outer<int>() == 123);
+static_assert(outer<int>() == 123); // cxx11-cxx14-error {{not an integral constant expression}} cxx11-cxx14-note {{in call}}
template int *outer<int *>(); // expected-note {{in instantiation}}
+#endif
-
+#if __cplusplus >= 202002L
namespace GH62611 {
template <auto A = [](auto x){}>
struct C {
@@ -87,3 +96,4 @@ void foo() {
}
}
+#endif
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 389002ab0e34..151d74f21d64 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 -std=c++11 -Wno-unused-value -fsyntax-only -verify=expected,expected-cxx14,cxx11 -fblocks %s
-// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -verify=expected-cxx14 -fblocks %s
-// RUN: %clang_cc1 -std=c++17 -Wno-unused-value -verify -ast-dump -fblocks %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -Wno-unused-value -fsyntax-only -verify=expected,not-cxx03,cxx03-cxx11,cxx11,expected-cxx14 -fblocks %s
+// RUN: %clang_cc1 -std=c++03 -Wno-unused-value -fsyntax-only -verify=expected,cxx03,cxx03-cxx11,expected-cxx14 -fblocks %s -Ddecltype=__decltype -Dstatic_assert=_Static_assert -Wno-c++11-extensions
+// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify=expected,not-cxx03,expected-cxx14 -fblocks %s
+// RUN: %clang_cc1 -std=c++17 -Wno-unused-value -verify=expected,not-cxx03 -ast-dump -fblocks %s | FileCheck %s
namespace std { class type_info; };
@@ -93,14 +94,14 @@ namespace ImplicitCapture {
[] { return ref_i; }; // expected-error {{variable 'ref_i' cannot be implicitly captured in a lambda with no capture-default specified}} expected-note {{lambda expression begins here}} expected-note 2 {{capture 'ref_i' by}} expected-note 2 {{default capture by}}
static int j;
- int &ref_j = j;
- [] { return ref_j; }; // ok
+ int &ref_j = j; // cxx03-note {{declared here}}
+ [] { return ref_j; }; // cxx03-error {{variable 'ref_j' cannot be implicitly captured in a lambda with no capture-default specified}} cxx03-note 4 {{capture}} cxx03-note {{lambda expression begins here}}
}
}
namespace SpecialMembers {
void f() {
- auto a = []{}; // expected-note 2{{here}} expected-note 2{{candidate}}
+ auto a = []{}; // expected-note 2{{here}} expected-note {{candidate}} not-cxx03-note {{candidate}}
decltype(a) b; // expected-error {{no matching constructor}}
decltype(a) c = a;
decltype(a) d = static_cast<decltype(a)&&>(a);
@@ -213,7 +214,7 @@ namespace VariadicPackExpansion {
};
template<typename...Ts> void local_class() {
- sink {
+ sink s(
[] (Ts t) {
struct S : Ts {
void f(Ts t) {
@@ -226,7 +227,7 @@ namespace VariadicPackExpansion {
s.f(t);
return s;
} (Ts()).g() ...
- };
+ );
};
struct X {}; struct Y {};
template void local_class<X, Y>();
@@ -296,7 +297,7 @@ namespace PR16708 {
namespace TypeDeduction {
struct S {};
void f() {
- const S s {};
+ const S s = S();
S &&t = [&] { return s; } ();
#if __cplusplus > 201103L
S &&u = [&] () -> auto { return s; } ();
@@ -308,7 +309,7 @@ namespace TypeDeduction {
namespace lambdas_in_NSDMIs {
template<class T>
struct L {
- T t{};
+ T t = T();
T t2 = ([](int a) { return [](int b) { return b; };})(t)(t);
};
L<int> l;
@@ -345,6 +346,7 @@ namespace CaptureIncomplete {
}
}
+#if __cplusplus >= 201103L
namespace CaptureAbstract {
struct S {
virtual void f() = 0; // expected-note {{unimplemented}}
@@ -362,6 +364,7 @@ namespace CaptureAbstract {
[=] { return s.n; }; // expected-error {{abstract}}
}
}
+#endif
namespace PR18128 {
auto l = [=]{}; // expected-error {{non-local lambda expression cannot have a capture-default}}
@@ -372,6 +375,8 @@ namespace PR18128 {
// expected-error@-1 {{non-local lambda expression cannot have a capture-default}}
// expected-error@-2 {{invalid use of non-static data member 'n'}}
// expected-cxx14-error@-3 {{a lambda expression may not appear inside of a constant expression}}
+ // cxx03-error@-4 {{function declaration cannot have variably modified type}}
+ // cxx03-warning@-5 {{variable length arrays in C++ are a Clang extension}}
int g(int k = ([=]{ return n; }(), 0));
// expected-error@-1 {{non-local lambda expression cannot have a capture-default}}
// expected-error@-2 {{invalid use of non-static data member 'n'}}
@@ -434,13 +439,13 @@ struct A {
template <typename F>
void g(F f) {
- auto a = A<decltype(f())>{};
+ auto a = A<decltype(f())>();
// expected-note@-1 {{in instantiation of template class 'PR20731::A<void>' requested here}}
auto xf = [a, f]() {};
int x = sizeof(xf);
};
void f() {
- g([] {});
+ g([] {}); // cxx03-warning {{template argument uses local type}}
// expected-note-re@-1 {{in instantiation of function template specialization 'PR20731::g<(lambda at {{.*}}>' requested here}}
}
@@ -491,8 +496,8 @@ namespace PR21857 {
fun() = default;
using Fn::operator();
};
- template<typename Fn> fun<Fn> wrap(Fn fn);
- auto x = wrap([](){});
+ template<typename Fn> fun<Fn> wrap(Fn fn); // cxx03-warning {{template argument uses unnamed type}}
+ auto x = wrap([](){}); // cxx03-warning {{template argument uses unnamed type}} cxx03-note 2 {{unnamed type used in template argument was declared here}}
}
namespace PR13987 {
@@ -559,8 +564,8 @@ struct B {
int x;
A a = [&] { int y = x; };
A b = [&] { [&] { [&] { int y = x; }; }; };
- A d = [&](auto param) { int y = x; }; // cxx11-error {{'auto' not allowed in lambda parameter}}
- A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx11-error 2 {{'auto' not allowed in lambda parameter}}
+ A d = [&](auto param) { int y = x; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}}
+ A e = [&](auto param) { [&] { [&](auto param2) { int y = x; }; }; }; // cxx03-cxx11-error 2 {{'auto' not allowed in lambda parameter}}
};
B<int> b;
@@ -588,9 +593,9 @@ struct S1 {
};
void foo1() {
- auto s0 = S1{[name=]() {}}; // expected-error 2 {{expected expression}}
- auto s1 = S1{[name=name]() {}}; // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}}
- // cxx11-warning@-1 {{initialized lambda captures are a C++14 extension}}
+ auto s0 = S1([name=]() {}); // expected-error {{expected expression}}
+ auto s1 = S1([name=name]() {}); // expected-error {{use of undeclared identifier 'name'; did you mean 'name1'?}}
+ // cxx03-cxx11-warning@-1 {{initialized lambda captures are a C++14 extension}}
}
}
@@ -606,7 +611,7 @@ namespace PR25627_dont_odr_use_local_consts {
namespace ConversionOperatorDoesNotHaveDeducedReturnType {
auto x = [](int){};
- auto y = [](auto &v) -> void { v.n = 0; }; // cxx11-error {{'auto' not allowed in lambda parameter}} cxx11-note {{candidate function not viable}} cxx11-note {{conversion candidate}}
+ auto y = [](auto &v) -> void { v.n = 0; }; // cxx03-cxx11-error {{'auto' not allowed in lambda parameter}} cxx03-cxx11-note {{candidate function not viable}} cxx03-cxx11-note {{conversion candidate}}
using T = decltype(x);
using U = decltype(y);
using ExpectedTypeT = void (*)(int);
@@ -626,14 +631,15 @@ namespace ConversionOperatorDoesNotHaveDeducedReturnType {
template<typename T>
friend constexpr U::operator ExpectedTypeU<T>() const noexcept;
#else
- friend auto T::operator()(int) const; // cxx11-error {{'auto' return without trailing return type; deduced return types are a C++14 extension}}
+ friend auto T::operator()(int) const; // cxx11-error {{'auto' return without trailing return type; deduced return types are a C++14 extension}} \
+ cxx03-error {{'auto' not allowed in function return type}}
friend T::operator ExpectedTypeT() const;
template<typename T>
- friend void U::operator()(T&) const; // cxx11-error {{friend declaration of 'operator()' does not match any declaration}}
+ friend void U::operator()(T&) const; // cxx03-cxx11-error {{friend declaration of 'operator()' does not match any declaration}}
// FIXME: This should not match, as above.
template<typename T>
- friend U::operator ExpectedTypeU<T>() const; // cxx11-error {{friend declaration of 'operator void (*)(type-parameter-0-0 &)' does not match any declaration}}
+ friend U::operator ExpectedTypeU<T>() const; // cxx03-cxx11-error {{friend declaration of 'operator void (*)(type-parameter-0-0 &)' does not match any declaration}}
#endif
private:
@@ -641,7 +647,7 @@ namespace ConversionOperatorDoesNotHaveDeducedReturnType {
};
// Should be OK in C++14 and later: lambda's call operator is a friend.
- void use(X &x) { y(x); } // cxx11-error {{no matching function for call to object}}
+ void use(X &x) { y(x); } // cxx03-cxx11-error {{no matching function for call to object}}
// This used to crash in return type deduction for the conversion opreator.
struct A { int n; void f() { +[](decltype(n)) {}; } };
@@ -682,8 +688,8 @@ namespace GH60518 {
// function parameters that are used in enable_if
struct StringLiteral {
template <int N>
-StringLiteral(const char (&array)[N])
- __attribute__((enable_if(__builtin_strlen(array) == 2,
+StringLiteral(const char (&array)[N]) // cxx03-note {{declared here}}
+ __attribute__((enable_if(__builtin_strlen(array) == 2, // cxx03-error {{'enable_if' attribute expression never produces a constant expression}} cxx03-note {{read of variable}}
"invalid string literal")));
};
@@ -695,7 +701,7 @@ StringLiteral(const char (&array)[N]) [[clang::annotate_type("test", array)]];
}
void Func1() {
- [[maybe_unused]] auto y = [&](decltype(StringLiteral("xx"))) {};
+ [[maybe_unused]] auto y = [&](decltype(StringLiteral("xx"))) {}; // cxx03-note {{in instantiation of function template specialization}}
[[maybe_unused]] auto z = [&](decltype(cpp_attribute::StringLiteral("xx"))) {};
}
@@ -718,6 +724,7 @@ static_assert([]() constexpr {
// Call operator attributes refering to a variable should
// be properly handled after D124351
+#if __cplusplus >= 201103L
constexpr int i = 2;
void foo() {
(void)[=][[gnu::aligned(i)]] () {}; // expected-warning{{C++23 extension}}
@@ -725,15 +732,18 @@ void foo() {
// CHECK-NEXT: ConstantExpr
// CHECK-NEXT: value: Int 2
}
+#endif
void GH48527() {
auto a = []()__attribute__((b(({ return 0; })))){}; // expected-warning {{unknown attribute 'b' ignored}}
}
+#if __cplusplus >= 201103L
void GH67492() {
constexpr auto test = 42;
auto lambda = (test, []() noexcept(true) {});
}
+#endif
// FIXME: This currently causes clang to crash in C++11 mode.
#if __cplusplus >= 201402L
diff --git a/clang/test/SemaCXX/lambda-implicit-this-capture.cpp b/clang/test/SemaCXX/lambda-implicit-this-capture.cpp
index 7e0e347a8fee..eb1f9e880aec 100644
--- a/clang/test/SemaCXX/lambda-implicit-this-capture.cpp
+++ b/clang/test/SemaCXX/lambda-implicit-this-capture.cpp
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -std=c++03 -verify=cxx11 %s -Wno-c++11-extensions
// RUN: %clang_cc1 -std=c++11 -verify=cxx11 %s
// RUN: %clang_cc1 -std=c++2a -verify=cxx2a %s
// RUN: %clang_cc1 -std=c++2a -verify=cxx2a-no-deprecated %s -Wno-deprecated
diff --git a/clang/test/SemaCXX/lambda-invalid-capture.cpp b/clang/test/SemaCXX/lambda-invalid-capture.cpp
index 236753871d70..5be8c8c5078f 100644
--- a/clang/test/SemaCXX/lambda-invalid-capture.cpp
+++ b/clang/test/SemaCXX/lambda-invalid-capture.cpp
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++03 -Wno-c++11-extensions %s
// RUN: %clang_cc1 -fsyntax-only -verify %s
// Don't crash.
diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp
index 4f78b7c71a91..1a99c6aac604 100644
--- a/clang/test/SemaCXX/new-delete.cpp
+++ b/clang/test/SemaCXX/new-delete.cpp
@@ -171,12 +171,7 @@ void good_deletes()
void bad_deletes()
{
delete 0; // expected-error {{cannot delete expression of type 'int'}}
- delete [0] (int*)0;
-#if __cplusplus <= 199711L
- // expected-error@-2 {{expected expression}}
-#else
- // expected-error@-4 {{expected variable name or 'this' in lambda capture list}}
-#endif
+ delete [0] (int*)0; // expected-error {{expected variable name or 'this' in lambda capture list}}
delete (void*)0; // expected-warning {{cannot delete expression with pointer-to-'void' type 'void *'}}
delete (T*)0; // expected-warning {{deleting pointer to incomplete type}}
::S::delete (int*)0; // expected-error {{expected unqualified-id}}
diff --git a/clang/test/SemaCXX/warn-cast-function-type-strict.cpp b/clang/test/SemaCXX/warn-cast-function-type-strict.cpp
index b3164afde5a0..8887b3c4c5d5 100644
--- a/clang/test/SemaCXX/warn-cast-function-type-strict.cpp
+++ b/clang/test/SemaCXX/warn-cast-function-type-strict.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -fblocks -fsyntax-only -Wcast-function-type -verify
-// RUN: %clang_cc1 %s -fblocks -fsyntax-only -Wcast-function-type-strict -verify
+// RUN: %clang_cc1 %s -fblocks -fsyntax-only -Wcast-function-type -verify=expected,strict
+// RUN: %clang_cc1 %s -fblocks -fsyntax-only -Wcast-function-type-strict -verify=expected,strict
// RUN: %clang_cc1 %s -fblocks -fsyntax-only -Wextra -verify
int x(long);
@@ -33,11 +33,11 @@ void foo() {
a = (f1 *)x;
b = (f2 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f2 *' (aka 'int (*)(void *)') converts to incompatible function type}}
b = reinterpret_cast<f2 *>(x); // expected-warning {{cast from 'int (*)(long)' to 'f2 *' (aka 'int (*)(void *)') converts to incompatible function type}}
- c = (f3 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f3 *' (aka 'int (*)(...)') converts to incompatible function type}}
+ c = (f3 *)x; // strict-warning {{cast from 'int (*)(long)' to 'f3 *' (aka 'int (*)(...)') converts to incompatible function type}}
d = (f4 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f4 *' (aka 'void (*)(...)') converts to incompatible function type}}
- e = (f5 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f5 *' (aka 'void (*)()') converts to incompatible function type}}
+ e = (f5 *)x; // strict-warning {{cast from 'int (*)(long)' to 'f5 *' (aka 'void (*)()') converts to incompatible function type}}
f = (f6 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f6 *' (aka 'int (*)(long, int)') converts to incompatible function type}}
- g = (f7 *)x; // expected-warning {{cast from 'int (*)(long)' to 'f7 *' (aka 'int (*)(long, ...)') converts to incompatible function type}}
+ g = (f7 *)x; // strict-warning {{cast from 'int (*)(long)' to 'f7 *' (aka 'int (*)(long, ...)') converts to incompatible function type}}
mf p1 = (mf)&S::foo; // expected-warning {{cast from 'void (S::*)(int *)' to 'mf' (aka 'void (S::*)(int)') converts to incompatible function type}}
diff --git a/clang/test/SemaObjC/attr-objc-NSObject.m b/clang/test/SemaObjC/attr-objc-NSObject.m
new file mode 100644
index 000000000000..76a01dcef016
--- /dev/null
+++ b/clang/test/SemaObjC/attr-objc-NSObject.m
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -verify -Wno-objc-root-class -fsyntax-only %s
+
+@interface NSArray<__covariant ObjectType>
+- (void)containsObject:(ObjectType)anObject; // expected-note {{passing argument to parameter 'anObject' here}}
+- (void)description;
+@end
+
+typedef __attribute__((NSObject)) struct Foo *FooRef;
+typedef struct Bar *BarRef;
+
+void good() {
+ FooRef object;
+ NSArray<FooRef> *array;
+ [array containsObject:object];
+ [object description];
+}
+
+void bad() {
+ BarRef object;
+ NSArray<BarRef> *array; // expected-error {{type argument 'BarRef' (aka 'struct Bar *') is neither an Objective-C object nor a block type}}
+ [array containsObject:object]; // expected-warning {{incompatible pointer types sending 'BarRef' (aka 'struct Bar *') to parameter of type 'id'}}
+ [object description]; // expected-warning {{receiver type 'BarRef' (aka 'struct Bar *') is not 'id' or interface pointer, consider casting it to 'id'}}
+}
diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index e122cea50f72..ed401135ad84 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -205,6 +205,11 @@ static cl::list<std::string> FileNames(cl::Positional,
cl::desc("[@<file>] [<file> ...]"),
cl::cat(ClangFormatCategory));
+static cl::opt<bool> FailOnIncompleteFormat(
+ "fail-on-incomplete-format",
+ cl::desc("If set, fail with exit code 1 on incomplete format."),
+ cl::init(false), cl::cat(ClangFormatCategory));
+
namespace clang {
namespace format {
@@ -399,7 +404,7 @@ class ClangFormatDiagConsumer : public DiagnosticConsumer {
};
// Returns true on error.
-static bool format(StringRef FileName) {
+static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) {
const bool IsSTDIN = FileName == "-";
if (!OutputXML && Inplace && IsSTDIN) {
errs() << "error: cannot use -i when reading from stdin.\n";
@@ -535,7 +540,7 @@ static bool format(StringRef FileName) {
Rewrite.getEditBuffer(ID).write(outs());
}
}
- return false;
+ return ErrorOnIncompleteFormat && !Status.FormatComplete;
}
} // namespace format
@@ -699,7 +704,7 @@ int main(int argc, const char **argv) {
}
if (FileNames.empty())
- return clang::format::format("-");
+ return clang::format::format("-", FailOnIncompleteFormat);
if (FileNames.size() > 1 &&
(!Offsets.empty() || !Lengths.empty() || !LineRanges.empty())) {
@@ -717,7 +722,7 @@ int main(int argc, const char **argv) {
errs() << "Formatting [" << FileNo++ << "/" << FileNames.size() << "] "
<< FileName << "\n";
}
- Error |= clang::format::format(FileName);
+ Error |= clang::format::format(FileName, FailOnIncompleteFormat);
}
return Error ? 1 : 0;
}
diff --git a/clang/tools/clang-installapi/ClangInstallAPI.cpp b/clang/tools/clang-installapi/ClangInstallAPI.cpp
index 54e82d78d4d2..13061cfa36ee 100644
--- a/clang/tools/clang-installapi/ClangInstallAPI.cpp
+++ b/clang/tools/clang-installapi/ClangInstallAPI.cpp
@@ -123,7 +123,7 @@ static bool run(ArrayRef<const char *> Args, const char *ProgName) {
}
}
- if (Ctx.Verifier->getState() == DylibVerifier::Result::Invalid)
+ if (Ctx.Verifier->verifyRemainingSymbols() == DylibVerifier::Result::Invalid)
return EXIT_FAILURE;
// After symbols have been collected, prepare to write output.
diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt
index 775f0f8486b8..519be36fe0fa 100644
--- a/clang/unittests/StaticAnalyzer/CMakeLists.txt
+++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_unittest(StaticAnalysisTests
CallEventTest.cpp
ConflictingEvalCallsTest.cpp
FalsePositiveRefutationBRVisitorTest.cpp
+ MemRegionDescriptiveNameTest.cpp
NoStateChangeFuncVisitorTest.cpp
ParamRegionTest.cpp
RangeSetTest.cpp
diff --git a/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp b/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp
new file mode 100644
index 000000000000..ba0c4d25e13b
--- /dev/null
+++ b/clang/unittests/StaticAnalyzer/MemRegionDescriptiveNameTest.cpp
@@ -0,0 +1,145 @@
+//===- MemRegionDescriptiveNameTest.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CheckerRegistration.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h"
+#include "gtest/gtest.h"
+#include <fstream>
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+class DescriptiveNameChecker : public Checker<check::PreCall> {
+public:
+ void checkPreCall(const CallEvent &Call, CheckerContext &C) const {
+ if (!HandlerFn.matches(Call))
+ return;
+
+ const MemRegion *ArgReg = Call.getArgSVal(0).getAsRegion();
+ assert(ArgReg && "expecting a location as the first argument");
+
+ auto DescriptiveName = ArgReg->getDescriptiveName(/*UseQuotes=*/false);
+ if (ExplodedNode *Node = C.generateNonFatalErrorNode(C.getState())) {
+ auto Report =
+ std::make_unique<PathSensitiveBugReport>(Bug, DescriptiveName, Node);
+ C.emitReport(std::move(Report));
+ }
+ }
+
+private:
+ const BugType Bug{this, "DescriptiveNameBug"};
+ const CallDescription HandlerFn = {{"reportDescriptiveName"}, 1};
+};
+
+void addDescriptiveNameChecker(AnalysisASTConsumer &AnalysisConsumer,
+ AnalyzerOptions &AnOpts) {
+ AnOpts.CheckersAndPackages = {{"DescriptiveNameChecker", true}};
+ AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) {
+ Registry.addChecker<DescriptiveNameChecker>("DescriptiveNameChecker",
+ "Desc", "DocsURI");
+ });
+}
+
+bool runChecker(StringRef Code, std::string &Output) {
+ return runCheckerOnCode<addDescriptiveNameChecker>(Code.str(), Output,
+ /*OnlyEmitWarnings=*/true);
+}
+
+TEST(MemRegionDescriptiveNameTest, ConcreteIntElementRegionIndex) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+const unsigned int index = 1;
+extern int array[3];
+void top() {
+ reportDescriptiveName(&array[index]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ EXPECT_EQ(Output, "DescriptiveNameChecker: array[1]\n");
+}
+
+TEST(MemRegionDescriptiveNameTest, SymbolicElementRegionIndex) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+extern unsigned int index;
+extern int array[3];
+void top() {
+ reportDescriptiveName(&array[index]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ EXPECT_EQ(Output, "DescriptiveNameChecker: array[index]\n");
+}
+
+TEST(MemRegionDescriptiveNameTest, SymbolicElementRegionIndexSymbolValFails) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+extern int* ptr;
+extern int array[3];
+void top() {
+ reportDescriptiveName(&array[(long)ptr]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ EXPECT_EQ(Output, "DescriptiveNameChecker: \n");
+}
+
+TEST(MemRegionDescriptiveNameTest, SymbolicElementRegionIndexOrigRegionFails) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+extern int getInt(void);
+extern int array[3];
+void top() {
+ reportDescriptiveName(&array[getInt()]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ EXPECT_EQ(Output, "DescriptiveNameChecker: \n");
+}
+
+TEST(MemRegionDescriptiveNameTest, SymbolicElementRegionIndexDescrNameFails) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+extern int *ptr;
+extern int array[3];
+void top() {
+ reportDescriptiveName(&array[*ptr]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ EXPECT_EQ(Output, "DescriptiveNameChecker: \n");
+}
+
+TEST(MemRegionDescriptiveNameTest,
+ SymbolicElementRegionIndexIncorrectSymbolName) {
+ StringRef Code = R"cpp(
+void reportDescriptiveName(int *p);
+extern int x, y;
+extern int array[3];
+void top() {
+ y = x;
+ reportDescriptiveName(&array[y]);
+})cpp";
+
+ std::string Output;
+ ASSERT_TRUE(runChecker(Code, Output));
+ // FIXME: Should return array[y], but returns array[x] (OriginRegion).
+ EXPECT_EQ(Output, "DescriptiveNameChecker: array[x]\n");
+}
+
+} // namespace
diff --git a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
index d5b2e7970f11..1629db18f1c2 100644
--- a/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
+++ b/compiler-rt/cmake/Modules/CompilerRTCompile.cmake
@@ -46,7 +46,7 @@ function(sanitizer_test_compile obj_list source arch)
# Write out architecture-specific flags into TARGET_CFLAGS variable.
get_target_flags_for_arch(${arch} TARGET_CFLAGS)
set(COMPILE_DEPS ${TEST_COMPILE_DEPS})
- if(NOT COMPILER_RT_STANDALONE_BUILD)
+ if(NOT COMPILER_RT_STANDALONE_BUILD OR COMPILER_RT_TEST_STANDALONE_BUILD_LIBS)
list(APPEND COMPILE_DEPS ${TEST_DEPS})
endif()
clang_compile(${output_obj} ${source}
diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt
index 4f09f1e6a691..e0771dd5a1a7 100644
--- a/compiler-rt/lib/msan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/msan/tests/CMakeLists.txt
@@ -69,9 +69,9 @@ macro(msan_compile obj_list source arch kind cflags)
sanitizer_test_compile(
${obj_list} ${source} ${arch}
KIND ${kind}
- COMPILE_DEPS ${MSAN_UNITTEST_HEADERS}
+ COMPILE_DEPS ${MSAN_UNITTEST_HEADERS} libcxx_msan_${arch}-build
DEPS msan
- CFLAGS -isystem ${CMAKE_CURRENT_BINARY_DIR}/../libcxx_msan_${arch}/include/c++/v1
+ CFLAGS -isystem ${MSAN_LIBCXX_DIR}/../include/c++/v1
${MSAN_UNITTEST_INSTRUMENTED_CFLAGS} ${cflags}
)
endmacro()
@@ -120,7 +120,7 @@ macro(add_msan_tests_for_arch arch kind cflags)
set(MSAN_TEST_DEPS ${MSAN_TEST_OBJECTS} libcxx_msan_${arch}-build
${MSAN_LOADABLE_SO}
"${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a")
- list(APPEND MSAN_TEST_DEPS msan)
+ list(APPEND MSAN_TEST_DEPS msan libcxx_msan_${arch}-build)
get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS)
add_compiler_rt_test(MsanUnitTests "Msan-${arch}${kind}-Test" ${arch}
OBJECTS ${MSAN_TEST_OBJECTS} "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a"
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.h b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.h
index 10361a320344..e39cb891575e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace_printer.h
@@ -30,10 +30,15 @@ class StackTracePrinter {
virtual void RenderFrame(InternalScopedString *buffer, const char *format,
int frame_no, uptr address, const AddressInfo *info,
- bool vs_style,
- const char *strip_path_prefix = "") = 0;
+ bool vs_style, const char *strip_path_prefix = "") {
+ // Should be pure virtual, but we can't depend on __cxa_pure_virtual.
+ UNIMPLEMENTED();
+ }
- virtual bool RenderNeedsSymbolization(const char *format) = 0;
+ virtual bool RenderNeedsSymbolization(const char *format) {
+ // Should be pure virtual, but we can't depend on __cxa_pure_virtual.
+ UNIMPLEMENTED();
+ }
void RenderSourceLocation(InternalScopedString *buffer, const char *file,
int line, int column, bool vs_style,
@@ -44,7 +49,10 @@ class StackTracePrinter {
const char *strip_path_prefix);
virtual void RenderData(InternalScopedString *buffer, const char *format,
const DataInfo *DI,
- const char *strip_path_prefix = "") = 0;
+ const char *strip_path_prefix = "") {
+ // Should be pure virtual, but we can't depend on __cxa_pure_virtual.
+ UNIMPLEMENTED();
+ }
private:
// To be called from StackTracePrinter::GetOrInit
diff --git a/compiler-rt/lib/tsan/tests/CMakeLists.txt b/compiler-rt/lib/tsan/tests/CMakeLists.txt
index ad8cc9b0eb05..1bc08bbf7450 100644
--- a/compiler-rt/lib/tsan/tests/CMakeLists.txt
+++ b/compiler-rt/lib/tsan/tests/CMakeLists.txt
@@ -67,7 +67,7 @@ endforeach()
set(TSAN_DEPS tsan)
# TSan uses C++ standard library headers.
if (TARGET cxx-headers OR HAVE_LIBCXX)
- set(TSAN_DEPS cxx-headers)
+ list(APPEND TSAN_DEPS cxx-headers)
endif()
# add_tsan_unittest(<name>
diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp
index 4bb818813cf7..f544e481b726 100644
--- a/compiler-rt/test/dfsan/custom.cpp
+++ b/compiler-rt/test/dfsan/custom.cpp
@@ -175,7 +175,7 @@ void test_stat() {
s.st_dev = i;
SAVE_ORIGINS(s)
- ret = stat("/nonexistent", &s);
+ ret = stat("/nonexistent_581cb021aba7", &s);
assert(-1 == ret);
ASSERT_ZERO_LABEL(ret);
ASSERT_LABEL(s.st_dev, i_label);
diff --git a/flang/include/flang/Common/Version.h b/flang/include/flang/Common/Version.h
index b1bd2416a618..3257d4a4f645 100644
--- a/flang/include/flang/Common/Version.h
+++ b/flang/include/flang/Common/Version.h
@@ -12,8 +12,8 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_FLANG_COMMON_VERSION_H
-#define LLVM_FLANG_COMMON_VERSION_H
+#ifndef FORTRAN_COMMON_VERSION_H
+#define FORTRAN_COMMON_VERSION_H
#include "flang/Version.inc"
#include "llvm/ADT/StringRef.h"
@@ -53,4 +53,4 @@ std::string getFlangFullVersion();
std::string getFlangToolFullVersion(llvm::StringRef ToolName);
} // namespace Fortran::common
-#endif // LLVM_FLANG_COMMON_VERSION_H
+#endif // FORTRAN_COMMON_VERSION_H
diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h
index 0c318e4023af..b0bbace82c04 100644
--- a/flang/include/flang/Frontend/CodeGenOptions.h
+++ b/flang/include/flang/Frontend/CodeGenOptions.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_CLANG_BASIC_CODEGENOPTIONS_H
-#define LLVM_CLANG_BASIC_CODEGENOPTIONS_H
+#ifndef FORTRAN_FRONTEND_CODEGENOPTIONS_H
+#define FORTRAN_FRONTEND_CODEGENOPTIONS_H
#include "llvm/Frontend/Debug/Options.h"
#include "llvm/Frontend/Driver/CodeGenOptions.h"
@@ -141,4 +141,4 @@ public:
} // end namespace Fortran::frontend
-#endif
+#endif // FORTRAN_FRONTEND_CODEGENOPTIONS_H
diff --git a/flang/include/flang/Frontend/LangOptions.h b/flang/include/flang/Frontend/LangOptions.h
index 7adf2eec9ca3..7ab219581886 100644
--- a/flang/include/flang/Frontend/LangOptions.h
+++ b/flang/include/flang/Frontend/LangOptions.h
@@ -12,8 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_FLANG_FRONTEND_LANGOPTIONS_H
-#define LLVM_FLANG_FRONTEND_LANGOPTIONS_H
+#ifndef FORTRAN_FRONTEND_LANGOPTIONS_H
+#define FORTRAN_FRONTEND_LANGOPTIONS_H
#include <string>
@@ -63,4 +63,4 @@ public:
} // end namespace Fortran::frontend
-#endif
+#endif // FORTRAN_FRONTEND_LANGOPTIONS_H
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index 336944d35b7e..ed389bbe4ae5 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -14,9 +14,12 @@
#include "flang/Evaluate/expression.h"
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/BuiltinModules.h"
+#include "flang/Lower/ConvertExprToHLFIR.h"
#include "flang/Lower/ConvertType.h"
#include "flang/Lower/ConvertVariable.h"
#include "flang/Lower/Mangler.h"
+#include "flang/Lower/StatementContext.h"
+#include "flang/Lower/SymbolMap.h"
#include "flang/Optimizer/Builder/Complex.h"
#include "flang/Optimizer/Builder/MutableBox.h"
#include "flang/Optimizer/Builder/Todo.h"
@@ -380,10 +383,21 @@ static mlir::Value genStructureComponentInit(
}
if (Fortran::semantics::IsPointer(sym)) {
- if (Fortran::semantics::IsProcedure(sym))
- TODO(loc, "procedure pointer component initial value");
- mlir::Value initialTarget =
- Fortran::lower::genInitialDataTarget(converter, loc, componentTy, expr);
+ mlir::Value initialTarget;
+ if (Fortran::semantics::IsProcedure(sym)) {
+ if (Fortran::evaluate::UnwrapExpr<Fortran::evaluate::NullPointer>(expr))
+ initialTarget =
+ fir::factory::createNullBoxProc(builder, loc, componentTy);
+ else {
+ Fortran::lower::SymMap globalOpSymMap;
+ Fortran::lower::StatementContext stmtCtx;
+ auto box{getBase(Fortran::lower::convertExprToAddress(
+ loc, converter, expr, globalOpSymMap, stmtCtx))};
+ initialTarget = builder.createConvert(loc, componentTy, box);
+ }
+ } else
+ initialTarget = Fortran::lower::genInitialDataTarget(converter, loc,
+ componentTy, expr);
res = builder.create<fir::InsertValueOp>(
loc, recTy, res, initialTarget,
builder.getArrayAttr(field.getAttributes()));
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 95faa0767e36..52c3479b1ea9 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -208,6 +208,25 @@ addUseDeviceClause(Fortran::lower::AbstractConverter &converter,
useDeviceSymbols.push_back(object.id());
}
+static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
+ mlir::Location loc,
+ llvm::SmallVectorImpl<mlir::Value> &lowerBound,
+ llvm::SmallVectorImpl<mlir::Value> &upperBound,
+ llvm::SmallVectorImpl<mlir::Value> &step,
+ std::size_t loopVarTypeSize) {
+ fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
+ // The types of lower bound, upper bound, and step are converted into the
+ // type of the loop variable if necessary.
+ mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
+ for (unsigned it = 0; it < (unsigned)lowerBound.size(); it++) {
+ lowerBound[it] =
+ firOpBuilder.createConvert(loc, loopVarType, lowerBound[it]);
+ upperBound[it] =
+ firOpBuilder.createConvert(loc, loopVarType, upperBound[it]);
+ step[it] = firOpBuilder.createConvert(loc, loopVarType, step[it]);
+ }
+}
+
//===----------------------------------------------------------------------===//
// ClauseProcessor unique clauses
//===----------------------------------------------------------------------===//
@@ -217,8 +236,7 @@ bool ClauseProcessor::processCollapse(
llvm::SmallVectorImpl<mlir::Value> &lowerBound,
llvm::SmallVectorImpl<mlir::Value> &upperBound,
llvm::SmallVectorImpl<mlir::Value> &step,
- llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &iv,
- std::size_t &loopVarTypeSize) const {
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &iv) const {
bool found = false;
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
@@ -236,7 +254,7 @@ bool ClauseProcessor::processCollapse(
found = true;
}
- loopVarTypeSize = 0;
+ std::size_t loopVarTypeSize = 0;
do {
Fortran::lower::pft::Evaluation *doLoop =
&doConstructEval->getFirstNestedEvaluation();
@@ -267,6 +285,9 @@ bool ClauseProcessor::processCollapse(
&*std::next(doConstructEval->getNestedEvaluations().begin());
} while (collapseValue > 0);
+ convertLoopBounds(converter, currentLocation, lowerBound, upperBound, step,
+ loopVarTypeSize);
+
return found;
}
@@ -902,17 +923,39 @@ bool ClauseProcessor::processMap(
bool ClauseProcessor::processReduction(
mlir::Location currentLocation,
- llvm::SmallVectorImpl<mlir::Value> &reductionVars,
- llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
- llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> *reductionSymbols)
- const {
+ llvm::SmallVectorImpl<mlir::Value> &outReductionVars,
+ llvm::SmallVectorImpl<mlir::Type> &outReductionTypes,
+ llvm::SmallVectorImpl<mlir::Attribute> &outReductionDeclSymbols,
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
+ *outReductionSymbols) const {
return findRepeatableClause<omp::clause::Reduction>(
[&](const omp::clause::Reduction &clause,
const Fortran::parser::CharBlock &) {
+ // Use local lists of reductions to prevent variables from other
+ // already-processed reduction clauses from impacting this reduction.
+ // For example, the whole `reductionVars` array is queried to decide
+ // whether to do the reduction byref.
+ llvm::SmallVector<mlir::Value> reductionVars;
+ llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
+ llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
ReductionProcessor rp;
rp.addDeclareReduction(currentLocation, converter, clause,
reductionVars, reductionDeclSymbols,
- reductionSymbols);
+ outReductionSymbols ? &reductionSymbols
+ : nullptr);
+
+ // Copy local lists into the output.
+ llvm::copy(reductionVars, std::back_inserter(outReductionVars));
+ llvm::copy(reductionDeclSymbols,
+ std::back_inserter(outReductionDeclSymbols));
+ if (outReductionSymbols)
+ llvm::copy(reductionSymbols,
+ std::back_inserter(*outReductionSymbols));
+
+ outReductionTypes.reserve(outReductionTypes.size() +
+ reductionVars.size());
+ llvm::transform(reductionVars, std::back_inserter(outReductionTypes),
+ [](mlir::Value v) { return v.getType(); });
});
}
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index ffa8a5e05593..8582716e6b9a 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -46,24 +46,20 @@ namespace omp {
/// methods that relate to clauses that can impact the lowering of that
/// construct.
class ClauseProcessor {
- using ClauseTy = Fortran::parser::OmpClause;
-
public:
ClauseProcessor(Fortran::lower::AbstractConverter &converter,
Fortran::semantics::SemanticsContext &semaCtx,
const Fortran::parser::OmpClauseList &clauses)
- : converter(converter), semaCtx(semaCtx), clauses2(clauses),
+ : converter(converter), semaCtx(semaCtx),
clauses(makeList(clauses, semaCtx)) {}
// 'Unique' clauses: They can appear at most once in the clause list.
- bool
- processCollapse(mlir::Location currentLocation,
- Fortran::lower::pft::Evaluation &eval,
- llvm::SmallVectorImpl<mlir::Value> &lowerBound,
- llvm::SmallVectorImpl<mlir::Value> &upperBound,
- llvm::SmallVectorImpl<mlir::Value> &step,
- llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &iv,
- std::size_t &loopVarTypeSize) const;
+ bool processCollapse(
+ mlir::Location currentLocation, Fortran::lower::pft::Evaluation &eval,
+ llvm::SmallVectorImpl<mlir::Value> &lowerBound,
+ llvm::SmallVectorImpl<mlir::Value> &upperBound,
+ llvm::SmallVectorImpl<mlir::Value> &step,
+ llvm::SmallVectorImpl<const Fortran::semantics::Symbol *> &iv) const;
bool processDefault() const;
bool processDevice(Fortran::lower::StatementContext &stmtCtx,
mlir::Value &result) const;
@@ -126,6 +122,7 @@ public:
bool
processReduction(mlir::Location currentLocation,
llvm::SmallVectorImpl<mlir::Value> &reductionVars,
+ llvm::SmallVectorImpl<mlir::Type> &reductionTypes,
llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
llvm::SmallVectorImpl<const Fortran::semantics::Symbol *>
*reductionSymbols = nullptr) const;
@@ -157,7 +154,6 @@ public:
private:
using ClauseIterator = List<Clause>::const_iterator;
- using ClauseIterator2 = std::list<ClauseTy>::const_iterator;
/// Utility to find a clause within a range in the clause list.
template <typename T>
@@ -183,7 +179,6 @@ private:
Fortran::lower::AbstractConverter &converter;
Fortran::semantics::SemanticsContext &semaCtx;
- const Fortran::parser::OmpClauseList &clauses2;
List<Clause> clauses;
};
@@ -239,19 +234,17 @@ bool ClauseProcessor::processMotionClauses(
template <typename... Ts>
void ClauseProcessor::processTODO(mlir::Location currentLocation,
llvm::omp::Directive directive) const {
- auto checkUnhandledClause = [&](const auto *x) {
+ auto checkUnhandledClause = [&](llvm::omp::Clause id, const auto *x) {
if (!x)
return;
TODO(currentLocation,
- "Unhandled clause " +
- llvm::StringRef(Fortran::parser::ParseTreeDumper::GetNodeName(*x))
- .upper() +
+ "Unhandled clause " + llvm::omp::getOpenMPClauseName(id).upper() +
" in " + llvm::omp::getOpenMPDirectiveName(directive).upper() +
" construct");
};
- for (ClauseIterator2 it = clauses2.v.begin(); it != clauses2.v.end(); ++it)
- (checkUnhandledClause(std::get_if<Ts>(&it->u)), ...);
+ for (ClauseIterator it = clauses.begin(); it != clauses.end(); ++it)
+ (checkUnhandledClause(it->id, std::get_if<Ts>(&it->u)), ...);
}
template <typename T>
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index d335129565b4..d91694c4f639 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -214,24 +214,6 @@ static void threadPrivatizeVars(Fortran::lower::AbstractConverter &converter,
firOpBuilder.restoreInsertionPoint(insPt);
}
-static mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
- std::size_t loopVarTypeSize) {
- // OpenMP runtime requires 32-bit or 64-bit loop variables.
- loopVarTypeSize = loopVarTypeSize * 8;
- if (loopVarTypeSize < 32) {
- loopVarTypeSize = 32;
- } else if (loopVarTypeSize > 64) {
- loopVarTypeSize = 64;
- mlir::emitWarning(converter.getCurrentLocation(),
- "OpenMP loop iteration variable cannot have more than 64 "
- "bits size and will be narrowed into 64 bits.");
- }
- assert((loopVarTypeSize == 32 || loopVarTypeSize == 64) &&
- "OpenMP loop iteration variable size must be transformed into 32-bit "
- "or 64-bit");
- return converter.getFirOpBuilder().getIntegerType(loopVarTypeSize);
-}
-
static mlir::Operation *
createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter,
mlir::Location loc, mlir::Value indexVal,
@@ -568,6 +550,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
mlir::omp::ClauseProcBindKindAttr procBindKindAttr;
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands,
reductionVars;
+ llvm::SmallVector<mlir::Type> reductionTypes;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
@@ -578,13 +561,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
cp.processDefault();
cp.processAllocate(allocatorOperands, allocateOperands);
if (!outerCombined)
- cp.processReduction(currentLocation, reductionVars, reductionDeclSymbols,
- &reductionSymbols);
-
- llvm::SmallVector<mlir::Type> reductionTypes;
- reductionTypes.reserve(reductionVars.size());
- llvm::transform(reductionVars, std::back_inserter(reductionTypes),
- [](mlir::Value v) { return v.getType(); });
+ cp.processReduction(currentLocation, reductionVars, reductionTypes,
+ reductionDeclSymbols, &reductionSymbols);
auto reductionCallback = [&](mlir::Operation *op) {
llvm::SmallVector<mlir::Location> locs(reductionVars.size(),
@@ -756,9 +734,7 @@ genTaskOp(Fortran::lower::AbstractConverter &converter,
cp.processMergeable(mergeableAttr);
cp.processPriority(stmtCtx, priorityClauseOperand);
cp.processDepend(dependTypeOperands, dependOperands);
- cp.processTODO<Fortran::parser::OmpClause::InReduction,
- Fortran::parser::OmpClause::Detach,
- Fortran::parser::OmpClause::Affinity>(
+ cp.processTODO<clause::InReduction, clause::Detach, clause::Affinity>(
currentLocation, llvm::omp::Directive::OMPD_task);
return genOpWithBody<mlir::omp::TaskOp>(
@@ -784,8 +760,8 @@ genTaskgroupOp(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> allocateOperands, allocatorOperands;
ClauseProcessor cp(converter, semaCtx, clauseList);
cp.processAllocate(allocatorOperands, allocateOperands);
- cp.processTODO<Fortran::parser::OmpClause::TaskReduction>(
- currentLocation, llvm::omp::Directive::OMPD_taskgroup);
+ cp.processTODO<clause::TaskReduction>(currentLocation,
+ llvm::omp::Directive::OMPD_taskgroup);
return genOpWithBody<mlir::omp::TaskgroupOp>(
OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
.setGenNested(genNested)
@@ -1124,16 +1100,11 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
cp.processMap(currentLocation, directive, stmtCtx, mapOperands, &mapSymTypes,
&mapSymLocs, &mapSymbols);
- cp.processTODO<Fortran::parser::OmpClause::Private,
- Fortran::parser::OmpClause::Firstprivate,
- Fortran::parser::OmpClause::IsDevicePtr,
- Fortran::parser::OmpClause::HasDeviceAddr,
- Fortran::parser::OmpClause::Reduction,
- Fortran::parser::OmpClause::InReduction,
- Fortran::parser::OmpClause::Allocate,
- Fortran::parser::OmpClause::UsesAllocators,
- Fortran::parser::OmpClause::Defaultmap>(
+ cp.processTODO<clause::Private, clause::Firstprivate, clause::IsDevicePtr,
+ clause::HasDeviceAddr, clause::Reduction, clause::InReduction,
+ clause::Allocate, clause::UsesAllocators, clause::Defaultmap>(
currentLocation, llvm::omp::Directive::OMPD_target);
+
// 5.8.1 Implicit Data-Mapping Attribute Rules
// The following code follows the implicit data-mapping rules to map all the
// symbols used inside the region that have not been explicitly mapped using
@@ -1252,8 +1223,8 @@ genTeamsOp(Fortran::lower::AbstractConverter &converter,
cp.processDefault();
cp.processNumTeams(stmtCtx, numTeamsClauseOperand);
cp.processThreadLimit(stmtCtx, threadLimitClauseOperand);
- cp.processTODO<Fortran::parser::OmpClause::Reduction>(
- currentLocation, llvm::omp::Directive::OMPD_teams);
+ cp.processTODO<clause::Reduction>(currentLocation,
+ llvm::omp::Directive::OMPD_teams);
return genOpWithBody<mlir::omp::TeamsOp>(
OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
@@ -1305,9 +1276,8 @@ static mlir::omp::DeclareTargetDeviceType getDeclareTargetInfo(
cp.processEnter(symbolAndClause);
cp.processLink(symbolAndClause);
cp.processDeviceType(deviceType);
- cp.processTODO<Fortran::parser::OmpClause::Indirect>(
- converter.getCurrentLocation(),
- llvm::omp::Directive::OMPD_declare_target);
+ cp.processTODO<clause::Indirect>(converter.getCurrentLocation(),
+ llvm::omp::Directive::OMPD_declare_target);
}
return deviceType;
@@ -1389,8 +1359,7 @@ genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter,
break;
case llvm::omp::Directive::OMPD_taskwait:
ClauseProcessor(converter, semaCtx, opClauseList)
- .processTODO<Fortran::parser::OmpClause::Depend,
- Fortran::parser::OmpClause::Nowait>(
+ .processTODO<clause::Depend, clause::Nowait>(
currentLocation, llvm::omp::Directive::OMPD_taskwait);
firOpBuilder.create<mlir::omp::TaskwaitOp>(currentLocation);
break;
@@ -1465,25 +1434,6 @@ genOMP(Fortran::lower::AbstractConverter &converter,
standaloneConstruct.u);
}
-static void convertLoopBounds(Fortran::lower::AbstractConverter &converter,
- mlir::Location loc,
- llvm::SmallVectorImpl<mlir::Value> &lowerBound,
- llvm::SmallVectorImpl<mlir::Value> &upperBound,
- llvm::SmallVectorImpl<mlir::Value> &step,
- std::size_t loopVarTypeSize) {
- fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
- // The types of lower bound, upper bound, and step are converted into the
- // type of the loop variable if necessary.
- mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize);
- for (unsigned it = 0; it < (unsigned)lowerBound.size(); it++) {
- lowerBound[it] =
- firOpBuilder.createConvert(loc, loopVarType, lowerBound[it]);
- upperBound[it] =
- firOpBuilder.createConvert(loc, loopVarType, upperBound[it]);
- step[it] = firOpBuilder.createConvert(loc, loopVarType, step[it]);
- }
-}
-
static llvm::SmallVector<const Fortran::semantics::Symbol *>
genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter,
mlir::Location &loc,
@@ -1517,7 +1467,7 @@ genLoopAndReductionVars(
mlir::Location &loc,
llvm::ArrayRef<const Fortran::semantics::Symbol *> loopArgs,
llvm::ArrayRef<const Fortran::semantics::Symbol *> reductionArgs,
- llvm::SmallVectorImpl<mlir::Type> &reductionTypes) {
+ llvm::ArrayRef<mlir::Type> reductionTypes) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
llvm::SmallVector<mlir::Type> blockArgTypes;
@@ -1579,27 +1529,20 @@ createSimdLoop(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> lowerBound, upperBound, step, reductionVars;
llvm::SmallVector<mlir::Value> alignedVars, nontemporalVars;
llvm::SmallVector<const Fortran::semantics::Symbol *> iv;
+ llvm::SmallVector<mlir::Type> reductionTypes;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
mlir::omp::ClauseOrderKindAttr orderClauseOperand;
mlir::IntegerAttr simdlenClauseOperand, safelenClauseOperand;
- std::size_t loopVarTypeSize;
ClauseProcessor cp(converter, semaCtx, loopOpClauseList);
- cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
- loopVarTypeSize);
+ cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
- cp.processReduction(loc, reductionVars, reductionDeclSymbols);
+ cp.processReduction(loc, reductionVars, reductionTypes, reductionDeclSymbols);
cp.processIf(clause::If::DirectiveNameModifier::Simd, ifClauseOperand);
cp.processSimdlen(simdlenClauseOperand);
cp.processSafelen(safelenClauseOperand);
- cp.processTODO<Fortran::parser::OmpClause::Aligned,
- Fortran::parser::OmpClause::Allocate,
- Fortran::parser::OmpClause::Linear,
- Fortran::parser::OmpClause::Nontemporal,
- Fortran::parser::OmpClause::Order>(loc, ompDirective);
-
- convertLoopBounds(converter, loc, lowerBound, upperBound, step,
- loopVarTypeSize);
+ cp.processTODO<clause::Aligned, clause::Allocate, clause::Linear,
+ clause::Nontemporal, clause::Order>(loc, ompDirective);
mlir::TypeRange resultType;
auto simdLoopOp = firOpBuilder.create<mlir::omp::SimdLoopOp>(
@@ -1638,6 +1581,7 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter,
llvm::SmallVector<mlir::Value> lowerBound, upperBound, step, reductionVars;
llvm::SmallVector<mlir::Value> linearVars, linearStepVars;
llvm::SmallVector<const Fortran::semantics::Symbol *> iv;
+ llvm::SmallVector<mlir::Type> reductionTypes;
llvm::SmallVector<mlir::Attribute> reductionDeclSymbols;
llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
mlir::omp::ClauseOrderKindAttr orderClauseOperand;
@@ -1645,19 +1589,13 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter,
mlir::UnitAttr nowaitClauseOperand, byrefOperand, scheduleSimdClauseOperand;
mlir::IntegerAttr orderedClauseOperand;
mlir::omp::ScheduleModifierAttr scheduleModClauseOperand;
- std::size_t loopVarTypeSize;
ClauseProcessor cp(converter, semaCtx, beginClauseList);
- cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv,
- loopVarTypeSize);
+ cp.processCollapse(loc, eval, lowerBound, upperBound, step, iv);
cp.processScheduleChunk(stmtCtx, scheduleChunkClauseOperand);
- cp.processReduction(loc, reductionVars, reductionDeclSymbols,
+ cp.processReduction(loc, reductionVars, reductionTypes, reductionDeclSymbols,
&reductionSymbols);
- cp.processTODO<Fortran::parser::OmpClause::Linear,
- Fortran::parser::OmpClause::Order>(loc, ompDirective);
-
- convertLoopBounds(converter, loc, lowerBound, upperBound, step,
- loopVarTypeSize);
+ cp.processTODO<clause::Linear, clause::Order>(loc, ompDirective);
if (ReductionProcessor::doReductionByRef(reductionVars))
byrefOperand = firOpBuilder.getUnitAttr();
@@ -1699,11 +1637,6 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter,
auto *nestedEval = getCollapsedLoopEval(
eval, Fortran::lower::getCollapseValue(beginClauseList));
- llvm::SmallVector<mlir::Type> reductionTypes;
- reductionTypes.reserve(reductionVars.size());
- llvm::transform(reductionVars, std::back_inserter(reductionTypes),
- [](mlir::Value v) { return v.getType(); });
-
auto ivCallback = [&](mlir::Operation *op) {
return genLoopAndReductionVars(op, converter, loc, iv, reductionSymbols,
reductionTypes);
@@ -1724,11 +1657,9 @@ static void createSimdWsloop(
const Fortran::parser::OmpClauseList &beginClauseList,
const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) {
ClauseProcessor cp(converter, semaCtx, beginClauseList);
- cp.processTODO<
- Fortran::parser::OmpClause::Aligned, Fortran::parser::OmpClause::Allocate,
- Fortran::parser::OmpClause::Linear, Fortran::parser::OmpClause::Safelen,
- Fortran::parser::OmpClause::Simdlen, Fortran::parser::OmpClause::Order>(
- loc, ompDirective);
+ cp.processTODO<clause::Aligned, clause::Allocate, clause::Linear,
+ clause::Safelen, clause::Simdlen, clause::Order>(loc,
+ ompDirective);
// TODO: Add support for vectorization - add vectorization hints inside loop
// body.
// OpenMP standard does not specify the length of vector instructions.
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index fa4a51e33848..b9c0660aa4da 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -15,6 +15,7 @@
#include <flang/Lower/AbstractConverter.h>
#include <flang/Lower/ConvertType.h>
+#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Parser/parse-tree.h>
#include <flang/Parser/tools.h>
#include <flang/Semantics/tools.h>
@@ -70,6 +71,24 @@ void genObjectList2(const Fortran::parser::OmpObjectList &objectList,
}
}
+mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
+ std::size_t loopVarTypeSize) {
+ // OpenMP runtime requires 32-bit or 64-bit loop variables.
+ loopVarTypeSize = loopVarTypeSize * 8;
+ if (loopVarTypeSize < 32) {
+ loopVarTypeSize = 32;
+ } else if (loopVarTypeSize > 64) {
+ loopVarTypeSize = 64;
+ mlir::emitWarning(converter.getCurrentLocation(),
+ "OpenMP loop iteration variable cannot have more than 64 "
+ "bits size and will be narrowed into 64 bits.");
+ }
+ assert((loopVarTypeSize == 32 || loopVarTypeSize == 64) &&
+ "OpenMP loop iteration variable size must be transformed into 32-bit "
+ "or 64-bit");
+ return converter.getFirOpBuilder().getIntegerType(loopVarTypeSize);
+}
+
void gatherFuncAndVarSyms(
const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) {
diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h
index 3ab0823a4621..4074bf73987d 100644
--- a/flang/lib/Lower/OpenMP/Utils.h
+++ b/flang/lib/Lower/OpenMP/Utils.h
@@ -51,6 +51,9 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc,
mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy,
bool isVal = false);
+mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter,
+ std::size_t loopVarTypeSize);
+
void gatherFuncAndVarSyms(
const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause,
llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause);
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 7dd60b5edcd5..021474871154 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -129,6 +129,7 @@ set(sources
exceptions.cpp
execute.cpp
extensions.cpp
+ external-unit.cpp
extrema.cpp
file.cpp
findloc.cpp
@@ -149,6 +150,7 @@ set(sources
numeric.cpp
pointer.cpp
product.cpp
+ pseudo-unit.cpp
ragged.cpp
random.cpp
reduction.cpp
diff --git a/flang/runtime/external-unit.cpp b/flang/runtime/external-unit.cpp
new file mode 100644
index 000000000000..9d650ceca4a8
--- /dev/null
+++ b/flang/runtime/external-unit.cpp
@@ -0,0 +1,333 @@
+//===-- runtime/external-unit.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of ExternalFileUnit for RT_USE_PSEUDO_FILE_UNIT=0.
+//
+//===----------------------------------------------------------------------===//
+
+#include "tools.h"
+
+#if !defined(RT_USE_PSEUDO_FILE_UNIT)
+
+#include "io-error.h"
+#include "lock.h"
+#include "unit-map.h"
+#include "unit.h"
+#include <cstdio>
+#include <limits>
+
+namespace Fortran::runtime::io {
+
+// The per-unit data structures are created on demand so that Fortran I/O
+// should work without a Fortran main program.
+static Lock unitMapLock;
+static Lock createOpenLock;
+static UnitMap *unitMap{nullptr};
+
+void FlushOutputOnCrash(const Terminator &terminator) {
+ if (!defaultOutput && !errorOutput) {
+ return;
+ }
+ IoErrorHandler handler{terminator};
+ handler.HasIoStat(); // prevent nested crash if flush has error
+ CriticalSection critical{unitMapLock};
+ if (defaultOutput) {
+ defaultOutput->FlushOutput(handler);
+ }
+ if (errorOutput) {
+ errorOutput->FlushOutput(handler);
+ }
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUp(int unit) {
+ return GetUnitMap().LookUp(unit);
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpOrCreate(
+ int unit, const Terminator &terminator, bool &wasExtant) {
+ return GetUnitMap().LookUpOrCreate(unit, terminator, wasExtant);
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit,
+ Direction dir, Fortran::common::optional<bool> isUnformatted,
+ const Terminator &terminator) {
+ // Make sure that the returned anonymous unit has been opened
+ // not just created in the unitMap.
+ CriticalSection critical{createOpenLock};
+ bool exists{false};
+ ExternalFileUnit *result{
+ GetUnitMap().LookUpOrCreate(unit, terminator, exists)};
+ if (result && !exists) {
+ IoErrorHandler handler{terminator};
+ result->OpenAnonymousUnit(
+ dir == Direction::Input ? OpenStatus::Unknown : OpenStatus::Replace,
+ Action::ReadWrite, Position::Rewind, Convert::Unknown, handler);
+ result->isUnformatted = isUnformatted;
+ }
+ return result;
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUp(
+ const char *path, std::size_t pathLen) {
+ return GetUnitMap().LookUp(path, pathLen);
+}
+
+ExternalFileUnit &ExternalFileUnit::CreateNew(
+ int unit, const Terminator &terminator) {
+ bool wasExtant{false};
+ ExternalFileUnit *result{
+ GetUnitMap().LookUpOrCreate(unit, terminator, wasExtant)};
+ RUNTIME_CHECK(terminator, result && !wasExtant);
+ return *result;
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpForClose(int unit) {
+ return GetUnitMap().LookUpForClose(unit);
+}
+
+ExternalFileUnit &ExternalFileUnit::NewUnit(
+ const Terminator &terminator, bool forChildIo) {
+ ExternalFileUnit &unit{GetUnitMap().NewUnit(terminator)};
+ unit.createdForInternalChildIo_ = forChildIo;
+ return unit;
+}
+
+bool ExternalFileUnit::OpenUnit(Fortran::common::optional<OpenStatus> status,
+ Fortran::common::optional<Action> action, Position position,
+ OwningPtr<char> &&newPath, std::size_t newPathLength, Convert convert,
+ IoErrorHandler &handler) {
+ if (convert == Convert::Unknown) {
+ convert = executionEnvironment.conversion;
+ }
+ swapEndianness_ = convert == Convert::Swap ||
+ (convert == Convert::LittleEndian && !isHostLittleEndian) ||
+ (convert == Convert::BigEndian && isHostLittleEndian);
+ bool impliedClose{false};
+ if (IsConnected()) {
+ bool isSamePath{newPath.get() && path() && pathLength() == newPathLength &&
+ std::memcmp(path(), newPath.get(), newPathLength) == 0};
+ if (status && *status != OpenStatus::Old && isSamePath) {
+ handler.SignalError("OPEN statement for connected unit may not have "
+ "explicit STATUS= other than 'OLD'");
+ return impliedClose;
+ }
+ if (!newPath.get() || isSamePath) {
+ // OPEN of existing unit, STATUS='OLD' or unspecified, not new FILE=
+ newPath.reset();
+ return impliedClose;
+ }
+ // Otherwise, OPEN on open unit with new FILE= implies CLOSE
+ DoImpliedEndfile(handler);
+ FlushOutput(handler);
+ TruncateFrame(0, handler);
+ Close(CloseStatus::Keep, handler);
+ impliedClose = true;
+ }
+ if (newPath.get() && newPathLength > 0) {
+ if (const auto *already{
+ GetUnitMap().LookUp(newPath.get(), newPathLength)}) {
+ handler.SignalError(IostatOpenAlreadyConnected,
+ "OPEN(UNIT=%d,FILE='%.*s'): file is already connected to unit %d",
+ unitNumber_, static_cast<int>(newPathLength), newPath.get(),
+ already->unitNumber_);
+ return impliedClose;
+ }
+ }
+ set_path(std::move(newPath), newPathLength);
+ Open(status.value_or(OpenStatus::Unknown), action, position, handler);
+ auto totalBytes{knownSize()};
+ if (access == Access::Direct) {
+ if (!openRecl) {
+ handler.SignalError(IostatOpenBadRecl,
+ "OPEN(UNIT=%d,ACCESS='DIRECT'): record length is not known",
+ unitNumber());
+ } else if (*openRecl <= 0) {
+ handler.SignalError(IostatOpenBadRecl,
+ "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is invalid",
+ unitNumber(), static_cast<std::intmax_t>(*openRecl));
+ } else if (totalBytes && (*totalBytes % *openRecl != 0)) {
+ handler.SignalError(IostatOpenBadRecl,
+ "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is not an "
+ "even divisor of the file size %jd",
+ unitNumber(), static_cast<std::intmax_t>(*openRecl),
+ static_cast<std::intmax_t>(*totalBytes));
+ }
+ recordLength = openRecl;
+ }
+ endfileRecordNumber.reset();
+ currentRecordNumber = 1;
+ if (totalBytes && access == Access::Direct && openRecl.value_or(0) > 0) {
+ endfileRecordNumber = 1 + (*totalBytes / *openRecl);
+ }
+ if (position == Position::Append) {
+ if (totalBytes) {
+ frameOffsetInFile_ = *totalBytes;
+ }
+ if (access != Access::Stream) {
+ if (!endfileRecordNumber) {
+ // Fake it so that we can backspace relative from the end
+ endfileRecordNumber = std::numeric_limits<std::int64_t>::max() - 2;
+ }
+ currentRecordNumber = *endfileRecordNumber;
+ }
+ }
+ return impliedClose;
+}
+
+void ExternalFileUnit::OpenAnonymousUnit(
+ Fortran::common::optional<OpenStatus> status,
+ Fortran::common::optional<Action> action, Position position,
+ Convert convert, IoErrorHandler &handler) {
+ // I/O to an unconnected unit reads/creates a local file, e.g. fort.7
+ std::size_t pathMaxLen{32};
+ auto path{SizedNew<char>{handler}(pathMaxLen)};
+ std::snprintf(path.get(), pathMaxLen, "fort.%d", unitNumber_);
+ OpenUnit(status, action, position, std::move(path), std::strlen(path.get()),
+ convert, handler);
+}
+
+void ExternalFileUnit::CloseUnit(CloseStatus status, IoErrorHandler &handler) {
+ DoImpliedEndfile(handler);
+ FlushOutput(handler);
+ Close(status, handler);
+}
+
+void ExternalFileUnit::DestroyClosed() {
+ GetUnitMap().DestroyClosed(*this); // destroys *this
+}
+
+Iostat ExternalFileUnit::SetDirection(Direction direction) {
+ if (direction == Direction::Input) {
+ if (mayRead()) {
+ direction_ = Direction::Input;
+ return IostatOk;
+ } else {
+ return IostatReadFromWriteOnly;
+ }
+ } else {
+ if (mayWrite()) {
+ direction_ = Direction::Output;
+ return IostatOk;
+ } else {
+ return IostatWriteToReadOnly;
+ }
+ }
+}
+
+UnitMap &ExternalFileUnit::CreateUnitMap() {
+ Terminator terminator{__FILE__, __LINE__};
+ IoErrorHandler handler{terminator};
+ UnitMap &newUnitMap{*New<UnitMap>{terminator}().release()};
+
+ bool wasExtant{false};
+ ExternalFileUnit &out{*newUnitMap.LookUpOrCreate(
+ FORTRAN_DEFAULT_OUTPUT_UNIT, terminator, wasExtant)};
+ RUNTIME_CHECK(terminator, !wasExtant);
+ out.Predefine(1);
+ handler.SignalError(out.SetDirection(Direction::Output));
+ out.isUnformatted = false;
+ defaultOutput = &out;
+
+ ExternalFileUnit &in{*newUnitMap.LookUpOrCreate(
+ FORTRAN_DEFAULT_INPUT_UNIT, terminator, wasExtant)};
+ RUNTIME_CHECK(terminator, !wasExtant);
+ in.Predefine(0);
+ handler.SignalError(in.SetDirection(Direction::Input));
+ in.isUnformatted = false;
+ defaultInput = &in;
+
+ ExternalFileUnit &error{
+ *newUnitMap.LookUpOrCreate(FORTRAN_ERROR_UNIT, terminator, wasExtant)};
+ RUNTIME_CHECK(terminator, !wasExtant);
+ error.Predefine(2);
+ handler.SignalError(error.SetDirection(Direction::Output));
+ error.isUnformatted = false;
+ errorOutput = &error;
+
+ return newUnitMap;
+}
+
+// A back-up atexit() handler for programs that don't terminate with a main
+// program END or a STOP statement or other Fortran-initiated program shutdown,
+// such as programs with a C main() that terminate normally. It flushes all
+// external I/O units. It is registered once the first time that any external
+// I/O is attempted.
+static void CloseAllExternalUnits() {
+ IoErrorHandler handler{"Fortran program termination"};
+ ExternalFileUnit::CloseAll(handler);
+}
+
+UnitMap &ExternalFileUnit::GetUnitMap() {
+ if (unitMap) {
+ return *unitMap;
+ }
+ {
+ CriticalSection critical{unitMapLock};
+ if (unitMap) {
+ return *unitMap;
+ }
+ unitMap = &CreateUnitMap();
+ }
+ std::atexit(CloseAllExternalUnits);
+ return *unitMap;
+}
+
+void ExternalFileUnit::CloseAll(IoErrorHandler &handler) {
+ CriticalSection critical{unitMapLock};
+ if (unitMap) {
+ unitMap->CloseAll(handler);
+ FreeMemoryAndNullify(unitMap);
+ }
+ defaultOutput = nullptr;
+ defaultInput = nullptr;
+ errorOutput = nullptr;
+}
+
+void ExternalFileUnit::FlushAll(IoErrorHandler &handler) {
+ CriticalSection critical{unitMapLock};
+ if (unitMap) {
+ unitMap->FlushAll(handler);
+ }
+}
+
+int ExternalFileUnit::GetAsynchronousId(IoErrorHandler &handler) {
+ if (!mayAsynchronous()) {
+ handler.SignalError(IostatBadAsynchronous);
+ return -1;
+ } else {
+ for (int j{0}; 64 * j < maxAsyncIds; ++j) {
+ if (auto least{asyncIdAvailable_[j].LeastElement()}) {
+ asyncIdAvailable_[j].reset(*least);
+ return 64 * j + static_cast<int>(*least);
+ }
+ }
+ handler.SignalError(IostatTooManyAsyncOps);
+ return -1;
+ }
+}
+
+bool ExternalFileUnit::Wait(int id) {
+ if (static_cast<std::size_t>(id) >= maxAsyncIds ||
+ asyncIdAvailable_[id / 64].test(id % 64)) {
+ return false;
+ } else {
+ if (id == 0) { // means "all IDs"
+ for (int j{0}; 64 * j < maxAsyncIds; ++j) {
+ asyncIdAvailable_[j].set();
+ }
+ asyncIdAvailable_[0].reset(0);
+ } else {
+ asyncIdAvailable_[id / 64].set(id % 64);
+ }
+ return true;
+ }
+}
+
+} // namespace Fortran::runtime::io
+
+#endif // !defined(RT_USE_PSEUDO_FILE_UNIT)
diff --git a/flang/runtime/io-stmt.cpp b/flang/runtime/io-stmt.cpp
index 075d7b5ae518..e3f1214324d8 100644
--- a/flang/runtime/io-stmt.cpp
+++ b/flang/runtime/io-stmt.cpp
@@ -227,7 +227,17 @@ ConnectionState &ExternalIoStatementBase::GetConnectionState() { return unit_; }
int ExternalIoStatementBase::EndIoStatement() {
CompleteOperation();
auto result{IoStatementBase::EndIoStatement()};
+#if !defined(RT_USE_PSEUDO_FILE_UNIT)
unit_.EndIoStatement(); // annihilates *this in unit_.u_
+#else
+ // Fetch the unit pointer before *this disappears.
+ ExternalFileUnit *unitPtr{&unit_};
+ // The pseudo file units are dynamically allocated
+ // and are not tracked in the unit map.
+ // They have to be destructed and deallocated here.
+ unitPtr->~ExternalFileUnit();
+ FreeMemory(unitPtr);
+#endif
return result;
}
diff --git a/flang/runtime/lock.h b/flang/runtime/lock.h
index 5fdcf4745c21..61b06a62ff7c 100644
--- a/flang/runtime/lock.h
+++ b/flang/runtime/lock.h
@@ -12,6 +12,7 @@
#define FORTRAN_RUNTIME_LOCK_H_
#include "terminator.h"
+#include "tools.h"
// Avoid <mutex> if possible to avoid introduction of C++ runtime
// library dependence.
@@ -35,7 +36,17 @@ namespace Fortran::runtime {
class Lock {
public:
-#if USE_PTHREADS
+#if RT_USE_PSEUDO_LOCK
+ // No lock implementation, e.g. for using together
+ // with RT_USE_PSEUDO_FILE_UNIT.
+ // The users of Lock class may use it under
+ // USE_PTHREADS and otherwise, so it has to provide
+ // all the interfaces.
+ void Take() {}
+ bool Try() { return true; }
+ void Drop() {}
+ bool TakeIfNoDeadlock() { return true; }
+#elif USE_PTHREADS
Lock() { pthread_mutex_init(&mutex_, nullptr); }
~Lock() { pthread_mutex_destroy(&mutex_); }
void Take() {
@@ -79,7 +90,9 @@ public:
}
private:
-#if USE_PTHREADS
+#if RT_USE_PSEUDO_FILE_UNIT
+ // No state.
+#elif USE_PTHREADS
pthread_mutex_t mutex_{};
volatile bool isBusy_{false};
volatile pthread_t holder_;
diff --git a/flang/runtime/pseudo-unit.cpp b/flang/runtime/pseudo-unit.cpp
new file mode 100644
index 000000000000..8b5f36e2233a
--- /dev/null
+++ b/flang/runtime/pseudo-unit.cpp
@@ -0,0 +1,167 @@
+//===-- runtime/pseudo-unit.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of ExternalFileUnit and PseudoOpenFile for
+// RT_USE_PSEUDO_FILE_UNIT=1.
+//
+//===----------------------------------------------------------------------===//
+
+#include "tools.h"
+
+#if defined(RT_USE_PSEUDO_FILE_UNIT)
+
+#include "io-error.h"
+#include "unit.h"
+#include <cstdio>
+
+namespace Fortran::runtime::io {
+
+void FlushOutputOnCrash(const Terminator &) {}
+
+ExternalFileUnit *ExternalFileUnit::LookUp(int) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpOrCreate(
+ int, const Terminator &, bool &) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit,
+ Direction direction, Fortran::common::optional<bool>,
+ const Terminator &terminator) {
+ if (direction != Direction::Output) {
+ terminator.Crash("ExternalFileUnit only supports output IO");
+ }
+ return New<ExternalFileUnit>{terminator}(unit).release();
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUp(const char *, std::size_t) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+ExternalFileUnit &ExternalFileUnit::CreateNew(int, const Terminator &) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+ExternalFileUnit *ExternalFileUnit::LookUpForClose(int) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+ExternalFileUnit &ExternalFileUnit::NewUnit(const Terminator &, bool) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+bool ExternalFileUnit::OpenUnit(Fortran::common::optional<OpenStatus> status,
+ Fortran::common::optional<Action>, Position, OwningPtr<char> &&,
+ std::size_t, Convert, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void ExternalFileUnit::OpenAnonymousUnit(Fortran::common::optional<OpenStatus>,
+ Fortran::common::optional<Action>, Position, Convert convert,
+ IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void ExternalFileUnit::CloseUnit(CloseStatus, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void ExternalFileUnit::DestroyClosed() {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+Iostat ExternalFileUnit::SetDirection(Direction direction) {
+ if (direction != Direction::Output) {
+ return IostatReadFromWriteOnly;
+ }
+ direction_ = direction;
+ return IostatOk;
+}
+
+void ExternalFileUnit::CloseAll(IoErrorHandler &) {}
+
+void ExternalFileUnit::FlushAll(IoErrorHandler &) {}
+
+int ExternalFileUnit::GetAsynchronousId(IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+bool ExternalFileUnit::Wait(int) {
+ Terminator{__FILE__, __LINE__}.Crash("unsupported");
+}
+
+void PseudoOpenFile::set_mayAsynchronous(bool yes) {
+ if (yes) {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+ }
+}
+
+Fortran::common::optional<PseudoOpenFile::FileOffset>
+PseudoOpenFile::knownSize() const {
+ Terminator{__FILE__, __LINE__}.Crash("unsupported");
+}
+
+void PseudoOpenFile::Open(OpenStatus, Fortran::common::optional<Action>,
+ Position, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void PseudoOpenFile::Close(CloseStatus, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+std::size_t PseudoOpenFile::Read(
+ FileOffset, char *, std::size_t, std::size_t, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+std::size_t PseudoOpenFile::Write(FileOffset at, const char *buffer,
+ std::size_t bytes, IoErrorHandler &handler) {
+ if (at) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+ }
+ // TODO: use persistent string buffer that can be reallocated
+ // as needed, and only freed at destruction of *this.
+ auto string{SizedNew<char>{handler}(bytes + 1)};
+ std::memcpy(string.get(), buffer, bytes);
+ string.get()[bytes] = '\0';
+ std::printf("%s", string.get());
+ return bytes;
+}
+
+void PseudoOpenFile::Truncate(FileOffset, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+int PseudoOpenFile::ReadAsynchronously(
+ FileOffset, char *, std::size_t, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+int PseudoOpenFile::WriteAsynchronously(
+ FileOffset, const char *, std::size_t, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void PseudoOpenFile::Wait(int, IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+void PseudoOpenFile::WaitAll(IoErrorHandler &handler) {
+ handler.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+Position PseudoOpenFile::InquirePosition() const {
+ Terminator{__FILE__, __LINE__}.Crash("%s: unsupported", RT_PRETTY_FUNCTION);
+}
+
+} // namespace Fortran::runtime::io
+
+#endif // defined(RT_USE_PSEUDO_FILE_UNIT)
diff --git a/flang/runtime/tools.h b/flang/runtime/tools.h
index df25eb888233..c70a1b438e33 100644
--- a/flang/runtime/tools.h
+++ b/flang/runtime/tools.h
@@ -21,6 +21,27 @@
#include <map>
#include <type_traits>
+/// \macro RT_PRETTY_FUNCTION
+/// Gets a user-friendly looking function signature for the current scope
+/// using the best available method on each platform. The exact format of the
+/// resulting string is implementation specific and non-portable, so this should
+/// only be used, for example, for logging or diagnostics.
+/// Copy of LLVM_PRETTY_FUNCTION
+#if defined(_MSC_VER)
+#define RT_PRETTY_FUNCTION __FUNCSIG__
+#elif defined(__GNUC__) || defined(__clang__)
+#define RT_PRETTY_FUNCTION __PRETTY_FUNCTION__
+#else
+#define RT_PRETTY_FUNCTION __func__
+#endif
+
+#if defined(RT_DEVICE_COMPILATION)
+// Use the pseudo lock and pseudo file unit implementations
+// for the device.
+#define RT_USE_PSEUDO_LOCK 1
+#define RT_USE_PSEUDO_FILE_UNIT 1
+#endif
+
namespace Fortran::runtime {
class Terminator;
diff --git a/flang/runtime/unit.cpp b/flang/runtime/unit.cpp
index 82f0e68cc20a..67f4775ae0a9 100644
--- a/flang/runtime/unit.cpp
+++ b/flang/runtime/unit.cpp
@@ -5,293 +5,23 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-
+//
+// Implementation of ExternalFileUnit common for both
+// RT_USE_PSEUDO_FILE_UNIT=0 and RT_USE_PSEUDO_FILE_UNIT=1.
+//
+//===----------------------------------------------------------------------===//
#include "unit.h"
#include "io-error.h"
#include "lock.h"
#include "tools.h"
-#include "unit-map.h"
-#include "flang/Runtime/magic-numbers.h"
-#include <cstdio>
#include <limits>
#include <utility>
namespace Fortran::runtime::io {
-// The per-unit data structures are created on demand so that Fortran I/O
-// should work without a Fortran main program.
-static Lock unitMapLock;
-static Lock createOpenLock;
-static UnitMap *unitMap{nullptr};
-static ExternalFileUnit *defaultInput{nullptr}; // unit 5
-static ExternalFileUnit *defaultOutput{nullptr}; // unit 6
-static ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
-
-void FlushOutputOnCrash(const Terminator &terminator) {
- if (!defaultOutput && !errorOutput) {
- return;
- }
- IoErrorHandler handler{terminator};
- handler.HasIoStat(); // prevent nested crash if flush has error
- CriticalSection critical{unitMapLock};
- if (defaultOutput) {
- defaultOutput->FlushOutput(handler);
- }
- if (errorOutput) {
- errorOutput->FlushOutput(handler);
- }
-}
-
-ExternalFileUnit *ExternalFileUnit::LookUp(int unit) {
- return GetUnitMap().LookUp(unit);
-}
-
-ExternalFileUnit *ExternalFileUnit::LookUpOrCreate(
- int unit, const Terminator &terminator, bool &wasExtant) {
- return GetUnitMap().LookUpOrCreate(unit, terminator, wasExtant);
-}
-
-ExternalFileUnit *ExternalFileUnit::LookUpOrCreateAnonymous(int unit,
- Direction dir, Fortran::common::optional<bool> isUnformatted,
- const Terminator &terminator) {
- // Make sure that the returned anonymous unit has been opened
- // not just created in the unitMap.
- CriticalSection critical{createOpenLock};
- bool exists{false};
- ExternalFileUnit *result{
- GetUnitMap().LookUpOrCreate(unit, terminator, exists)};
- if (result && !exists) {
- IoErrorHandler handler{terminator};
- result->OpenAnonymousUnit(
- dir == Direction::Input ? OpenStatus::Unknown : OpenStatus::Replace,
- Action::ReadWrite, Position::Rewind, Convert::Unknown, handler);
- result->isUnformatted = isUnformatted;
- }
- return result;
-}
-
-ExternalFileUnit *ExternalFileUnit::LookUp(
- const char *path, std::size_t pathLen) {
- return GetUnitMap().LookUp(path, pathLen);
-}
-
-ExternalFileUnit &ExternalFileUnit::CreateNew(
- int unit, const Terminator &terminator) {
- bool wasExtant{false};
- ExternalFileUnit *result{
- GetUnitMap().LookUpOrCreate(unit, terminator, wasExtant)};
- RUNTIME_CHECK(terminator, result && !wasExtant);
- return *result;
-}
-
-ExternalFileUnit *ExternalFileUnit::LookUpForClose(int unit) {
- return GetUnitMap().LookUpForClose(unit);
-}
-
-ExternalFileUnit &ExternalFileUnit::NewUnit(
- const Terminator &terminator, bool forChildIo) {
- ExternalFileUnit &unit{GetUnitMap().NewUnit(terminator)};
- unit.createdForInternalChildIo_ = forChildIo;
- return unit;
-}
-
-bool ExternalFileUnit::OpenUnit(Fortran::common::optional<OpenStatus> status,
- Fortran::common::optional<Action> action, Position position,
- OwningPtr<char> &&newPath, std::size_t newPathLength, Convert convert,
- IoErrorHandler &handler) {
- if (convert == Convert::Unknown) {
- convert = executionEnvironment.conversion;
- }
- swapEndianness_ = convert == Convert::Swap ||
- (convert == Convert::LittleEndian && !isHostLittleEndian) ||
- (convert == Convert::BigEndian && isHostLittleEndian);
- bool impliedClose{false};
- if (IsConnected()) {
- bool isSamePath{newPath.get() && path() && pathLength() == newPathLength &&
- std::memcmp(path(), newPath.get(), newPathLength) == 0};
- if (status && *status != OpenStatus::Old && isSamePath) {
- handler.SignalError("OPEN statement for connected unit may not have "
- "explicit STATUS= other than 'OLD'");
- return impliedClose;
- }
- if (!newPath.get() || isSamePath) {
- // OPEN of existing unit, STATUS='OLD' or unspecified, not new FILE=
- newPath.reset();
- return impliedClose;
- }
- // Otherwise, OPEN on open unit with new FILE= implies CLOSE
- DoImpliedEndfile(handler);
- FlushOutput(handler);
- TruncateFrame(0, handler);
- Close(CloseStatus::Keep, handler);
- impliedClose = true;
- }
- if (newPath.get() && newPathLength > 0) {
- if (const auto *already{
- GetUnitMap().LookUp(newPath.get(), newPathLength)}) {
- handler.SignalError(IostatOpenAlreadyConnected,
- "OPEN(UNIT=%d,FILE='%.*s'): file is already connected to unit %d",
- unitNumber_, static_cast<int>(newPathLength), newPath.get(),
- already->unitNumber_);
- return impliedClose;
- }
- }
- set_path(std::move(newPath), newPathLength);
- Open(status.value_or(OpenStatus::Unknown), action, position, handler);
- auto totalBytes{knownSize()};
- if (access == Access::Direct) {
- if (!openRecl) {
- handler.SignalError(IostatOpenBadRecl,
- "OPEN(UNIT=%d,ACCESS='DIRECT'): record length is not known",
- unitNumber());
- } else if (*openRecl <= 0) {
- handler.SignalError(IostatOpenBadRecl,
- "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is invalid",
- unitNumber(), static_cast<std::intmax_t>(*openRecl));
- } else if (totalBytes && (*totalBytes % *openRecl != 0)) {
- handler.SignalError(IostatOpenBadRecl,
- "OPEN(UNIT=%d,ACCESS='DIRECT',RECL=%jd): record length is not an "
- "even divisor of the file size %jd",
- unitNumber(), static_cast<std::intmax_t>(*openRecl),
- static_cast<std::intmax_t>(*totalBytes));
- }
- recordLength = openRecl;
- }
- endfileRecordNumber.reset();
- currentRecordNumber = 1;
- if (totalBytes && access == Access::Direct && openRecl.value_or(0) > 0) {
- endfileRecordNumber = 1 + (*totalBytes / *openRecl);
- }
- if (position == Position::Append) {
- if (totalBytes) {
- frameOffsetInFile_ = *totalBytes;
- }
- if (access != Access::Stream) {
- if (!endfileRecordNumber) {
- // Fake it so that we can backspace relative from the end
- endfileRecordNumber = std::numeric_limits<std::int64_t>::max() - 2;
- }
- currentRecordNumber = *endfileRecordNumber;
- }
- }
- return impliedClose;
-}
-
-void ExternalFileUnit::OpenAnonymousUnit(
- Fortran::common::optional<OpenStatus> status,
- Fortran::common::optional<Action> action, Position position,
- Convert convert, IoErrorHandler &handler) {
- // I/O to an unconnected unit reads/creates a local file, e.g. fort.7
- std::size_t pathMaxLen{32};
- auto path{SizedNew<char>{handler}(pathMaxLen)};
- std::snprintf(path.get(), pathMaxLen, "fort.%d", unitNumber_);
- OpenUnit(status, action, position, std::move(path), std::strlen(path.get()),
- convert, handler);
-}
-
-void ExternalFileUnit::CloseUnit(CloseStatus status, IoErrorHandler &handler) {
- DoImpliedEndfile(handler);
- FlushOutput(handler);
- Close(status, handler);
-}
-
-void ExternalFileUnit::DestroyClosed() {
- GetUnitMap().DestroyClosed(*this); // destroys *this
-}
-
-Iostat ExternalFileUnit::SetDirection(Direction direction) {
- if (direction == Direction::Input) {
- if (mayRead()) {
- direction_ = Direction::Input;
- return IostatOk;
- } else {
- return IostatReadFromWriteOnly;
- }
- } else {
- if (mayWrite()) {
- direction_ = Direction::Output;
- return IostatOk;
- } else {
- return IostatWriteToReadOnly;
- }
- }
-}
-
-UnitMap &ExternalFileUnit::CreateUnitMap() {
- Terminator terminator{__FILE__, __LINE__};
- IoErrorHandler handler{terminator};
- UnitMap &newUnitMap{*New<UnitMap>{terminator}().release()};
-
- bool wasExtant{false};
- ExternalFileUnit &out{*newUnitMap.LookUpOrCreate(
- FORTRAN_DEFAULT_OUTPUT_UNIT, terminator, wasExtant)};
- RUNTIME_CHECK(terminator, !wasExtant);
- out.Predefine(1);
- handler.SignalError(out.SetDirection(Direction::Output));
- out.isUnformatted = false;
- defaultOutput = &out;
-
- ExternalFileUnit &in{*newUnitMap.LookUpOrCreate(
- FORTRAN_DEFAULT_INPUT_UNIT, terminator, wasExtant)};
- RUNTIME_CHECK(terminator, !wasExtant);
- in.Predefine(0);
- handler.SignalError(in.SetDirection(Direction::Input));
- in.isUnformatted = false;
- defaultInput = &in;
-
- ExternalFileUnit &error{
- *newUnitMap.LookUpOrCreate(FORTRAN_ERROR_UNIT, terminator, wasExtant)};
- RUNTIME_CHECK(terminator, !wasExtant);
- error.Predefine(2);
- handler.SignalError(error.SetDirection(Direction::Output));
- error.isUnformatted = false;
- errorOutput = &error;
-
- return newUnitMap;
-}
-
-// A back-up atexit() handler for programs that don't terminate with a main
-// program END or a STOP statement or other Fortran-initiated program shutdown,
-// such as programs with a C main() that terminate normally. It flushes all
-// external I/O units. It is registered once the first time that any external
-// I/O is attempted.
-static void CloseAllExternalUnits() {
- IoErrorHandler handler{"Fortran program termination"};
- ExternalFileUnit::CloseAll(handler);
-}
-
-UnitMap &ExternalFileUnit::GetUnitMap() {
- if (unitMap) {
- return *unitMap;
- }
- {
- CriticalSection critical{unitMapLock};
- if (unitMap) {
- return *unitMap;
- }
- unitMap = &CreateUnitMap();
- }
- std::atexit(CloseAllExternalUnits);
- return *unitMap;
-}
-
-void ExternalFileUnit::CloseAll(IoErrorHandler &handler) {
- CriticalSection critical{unitMapLock};
- if (unitMap) {
- unitMap->CloseAll(handler);
- FreeMemoryAndNullify(unitMap);
- }
- defaultOutput = nullptr;
- defaultInput = nullptr;
- errorOutput = nullptr;
-}
-
-void ExternalFileUnit::FlushAll(IoErrorHandler &handler) {
- CriticalSection critical{unitMapLock};
- if (unitMap) {
- unitMap->FlushAll(handler);
- }
-}
+ExternalFileUnit *defaultInput{nullptr}; // unit 5
+ExternalFileUnit *defaultOutput{nullptr}; // unit 6
+ExternalFileUnit *errorOutput{nullptr}; // unit 0 extension
static inline void SwapEndianness(
char *data, std::size_t bytes, std::size_t elementBytes) {
@@ -999,39 +729,6 @@ void ExternalFileUnit::PopChildIo(ChildIo &child) {
child_.reset(child.AcquirePrevious().release()); // deletes top child
}
-int ExternalFileUnit::GetAsynchronousId(IoErrorHandler &handler) {
- if (!mayAsynchronous()) {
- handler.SignalError(IostatBadAsynchronous);
- return -1;
- } else {
- for (int j{0}; 64 * j < maxAsyncIds; ++j) {
- if (auto least{asyncIdAvailable_[j].LeastElement()}) {
- asyncIdAvailable_[j].reset(*least);
- return 64 * j + static_cast<int>(*least);
- }
- }
- handler.SignalError(IostatTooManyAsyncOps);
- return -1;
- }
-}
-
-bool ExternalFileUnit::Wait(int id) {
- if (static_cast<std::size_t>(id) >= maxAsyncIds ||
- asyncIdAvailable_[id / 64].test(id % 64)) {
- return false;
- } else {
- if (id == 0) { // means "all IDs"
- for (int j{0}; 64 * j < maxAsyncIds; ++j) {
- asyncIdAvailable_[j].set();
- }
- asyncIdAvailable_[0].reset(0);
- } else {
- asyncIdAvailable_[id / 64].set(id % 64);
- }
- return true;
- }
-}
-
std::int32_t ExternalFileUnit::ReadHeaderOrFooter(std::int64_t frameOffset) {
std::int32_t word;
char *wordPtr{reinterpret_cast<char *>(&word)};
diff --git a/flang/runtime/unit.h b/flang/runtime/unit.h
index fc5bead7e1d9..5f854abd42f6 100644
--- a/flang/runtime/unit.h
+++ b/flang/runtime/unit.h
@@ -31,10 +31,67 @@ namespace Fortran::runtime::io {
class UnitMap;
class ChildIo;
+class ExternalFileUnit;
+
+// Predefined file units.
+extern ExternalFileUnit *defaultInput; // unit 5
+extern ExternalFileUnit *defaultOutput; // unit 6
+extern ExternalFileUnit *errorOutput; // unit 0 extension
+
+#if defined(RT_USE_PSEUDO_FILE_UNIT)
+// A flavor of OpenFile class that pretends to be a terminal,
+// and only provides basic buffering of the output
+// in an internal buffer, and Write's the output
+// using std::printf(). Since it does not rely on file system
+// APIs, it can be used to implement external output
+// for offload devices.
+class PseudoOpenFile {
+public:
+ using FileOffset = std::int64_t;
+
+ const char *path() const { return nullptr; }
+ std::size_t pathLength() const { return 0; }
+ void set_path(OwningPtr<char> &&, std::size_t bytes) {}
+ bool mayRead() const { return false; }
+ bool mayWrite() const { return true; }
+ bool mayPosition() const { return false; }
+ bool mayAsynchronous() const { return false; }
+ void set_mayAsynchronous(bool yes);
+ // Pretend to be a terminal to force the output
+ // at the end of IO statement.
+ bool isTerminal() const { return true; }
+ bool isWindowsTextFile() const { return false; }
+ Fortran::common::optional<FileOffset> knownSize() const;
+ bool IsConnected() const { return false; }
+ void Open(OpenStatus, Fortran::common::optional<Action>, Position,
+ IoErrorHandler &);
+ void Predefine(int fd) {}
+ void Close(CloseStatus, IoErrorHandler &);
+ std::size_t Read(FileOffset, char *, std::size_t minBytes,
+ std::size_t maxBytes, IoErrorHandler &);
+ std::size_t Write(FileOffset, const char *, std::size_t, IoErrorHandler &);
+ void Truncate(FileOffset, IoErrorHandler &);
+ int ReadAsynchronously(FileOffset, char *, std::size_t, IoErrorHandler &);
+ int WriteAsynchronously(
+ FileOffset, const char *, std::size_t, IoErrorHandler &);
+ void Wait(int id, IoErrorHandler &);
+ void WaitAll(IoErrorHandler &);
+ Position InquirePosition() const;
+};
+#endif // defined(RT_USE_PSEUDO_FILE_UNIT)
+
+#if !defined(RT_USE_PSEUDO_FILE_UNIT)
+using OpenFileClass = OpenFile;
+using FileFrameClass = FileFrame<ExternalFileUnit>;
+#else // defined(RT_USE_PSEUDO_FILE_UNIT)
+using OpenFileClass = PseudoOpenFile;
+// Use a modestly sized buffer for the pseudo file unit frame.
+using FileFrameClass = FileFrame<ExternalFileUnit, 1024>;
+#endif // defined(RT_USE_PSEUDO_FILE_UNIT)
class ExternalFileUnit : public ConnectionState,
- public OpenFile,
- public FileFrame<ExternalFileUnit> {
+ public OpenFileClass,
+ public FileFrameClass {
public:
static constexpr int maxAsyncIds{64 * 16};
diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90
new file mode 100644
index 000000000000..f41c832ee5ec
--- /dev/null
+++ b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90
@@ -0,0 +1,48 @@
+! Test passing
+! 1. NULL(),
+! 2. procedure,
+! 3. procedure pointer, (pending)
+! 4. reference to a function that returns a procedure pointer (pending)
+! to a derived type structure constructor.
+! RUN: bbc -emit-hlfir -o - %s | FileCheck %s
+
+ MODULE M
+ TYPE :: DT
+ PROCEDURE(Fun), POINTER, NOPASS :: pp1
+ END TYPE
+
+ CONTAINS
+
+ INTEGER FUNCTION Fun(Arg)
+ INTEGER :: Arg
+ Fun = Arg
+ END FUNCTION
+
+ END MODULE
+
+ PROGRAM MAIN
+ USE M
+ IMPLICIT NONE
+ TYPE (DT), PARAMETER :: v1 = DT(NULL())
+ TYPE (DT) :: v2
+ v2 = DT(fun)
+ END
+
+! CHECK-LABEL: fir.global internal @_QFECv1 constant : !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}> {
+! CHECK: %[[VAL_0:.*]] = fir.undefined !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: %[[VAL_1:.*]] = fir.field_index pp1, !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: %[[VAL_2:.*]] = fir.zero_bits (!fir.ref<i32>) -> i32
+! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<i32>) -> i32) -> !fir.boxproc<(!fir.ref<i32>) -> i32>
+! CHECK: %[[VAL_4:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_3]], ["pp1", !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>] : (!fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>, !fir.boxproc<(!fir.ref<i32>) -> i32>) -> !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: fir.has_value %[[VAL_4]] : !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: }
+
+! CHECK-LABEL: fir.global internal @_QQro._QMmTdt.0 constant : !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}> {
+! CHECK: %[[VAL_0:.*]] = fir.undefined !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: %[[VAL_1:.*]] = fir.field_index pp1, !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QMmPfun) : (!fir.ref<i32>) -> i32
+! CHECK: %[[VAL_3:.*]] = fir.emboxproc %[[VAL_2]] : ((!fir.ref<i32>) -> i32) -> !fir.boxproc<() -> ()>
+! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.boxproc<() -> ()>) -> !fir.boxproc<(!fir.ref<i32>) -> i32>
+! CHECK: %[[VAL_5:.*]] = fir.insert_value %[[VAL_0]], %[[VAL_4]], ["pp1", !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>] : (!fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>, !fir.boxproc<(!fir.ref<i32>) -> i32>) -> !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: fir.has_value %[[VAL_5]] : !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref<i32>) -> i32>}>
+! CHECK: }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90
new file mode 100644
index 000000000000..9e9951c399c9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-multi.f90
@@ -0,0 +1,81 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME: @[[MIN_RED_I32_NAME:.*]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK: %[[C0_1:.*]] = arith.constant 2147483647 : i32
+!CHECK: omp.yield(%[[C0_1]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK: %[[RES:.*]] = arith.minsi %[[ARG0]], %[[ARG1]] : i32
+!CHECK: omp.yield(%[[RES]] : i32)
+!CHECK: }
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME: @[[ADD_RED_F32_NAME:.*]] : f32 init {
+!CHECK: ^bb0(%{{.*}}: f32):
+!CHECK: %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK: omp.yield(%[[C0_1]] : f32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
+!CHECK: %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}} : f32
+!CHECK: omp.yield(%[[RES]] : f32)
+!CHECK: }
+
+!CHECK-LABEL: omp.declare_reduction
+!CHECK-SAME: @[[ADD_RED_I32_NAME:.*]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK: %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK: omp.yield(%[[C0_1]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK: %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK: omp.yield(%[[RES]] : i32)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPmultiple_reduction
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductionEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFmultiple_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_REF:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_reductionEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_REF]] {uniq_name = "_QFmultiple_reductionEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: %[[Z_REF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_reductionEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z_REF]] {uniq_name = "_QFmultiple_reductionEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop reduction(
+!CHECK-SAME: @[[ADD_RED_I32_NAME]] %[[X_DECL]]#0 -> %[[PRV_X:.+]] : !fir.ref<i32>,
+!CHECK-SAME: @[[ADD_RED_F32_NAME]] %[[Y_DECL]]#0 -> %[[PRV_Y:.+]] : !fir.ref<f32>,
+!CHECK-SAME: @[[MIN_RED_I32_NAME]] %[[Z_DECL]]#0 -> %[[PRV_Z:.+]] : !fir.ref<i32>) {{.*}}{
+!CHECK: %[[PRV_X_DECL:.+]]:2 = hlfir.declare %[[PRV_X]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRV_Y_DECL:.+]]:2 = hlfir.declare %[[PRV_Y]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK: %[[PRV_Z_DECL:.+]]:2 = hlfir.declare %[[PRV_Z]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[LPRV_X:.+]] = fir.load %[[PRV_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RES_X:.+]] = arith.addi %[[LPRV_X]], %{{.+}} : i32
+!CHECK: hlfir.assign %[[RES_X]] to %[[PRV_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[LPRV_Y:.+]] = fir.load %[[PRV_Y_DECL]]#0 : !fir.ref<f32>
+!CHECK: %[[RES_Y:.+]] = arith.addf %[[LPRV_Y]], %{{.+}} : f32
+!CHECK: hlfir.assign %[[RES_Y]] to %[[PRV_Y_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK: %[[LPRV_Z:.+]] = fir.load %[[PRV_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RES_Z:.+]] = arith.select %{{.+}}, %[[LPRV_Z]], %{{.+}} : i32
+!CHECK: hlfir.assign %[[RES_Z]] to %[[PRV_Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: }
+!CHECK: return
+subroutine multiple_reduction(v)
+ implicit none
+ integer, intent(in) :: v(:)
+ integer :: i
+ integer :: x
+ real :: y
+ integer:: z
+ x = 0
+ y = 0.0
+ z = 10
+
+ !$omp do reduction(+:x,y) reduction(min:z)
+ do i=1, 100
+ x = x + v(i)
+ y = y + 1.5 * v(i)
+ z = min(z, v(i))
+ end do
+ !$omp end do
+end subroutine
diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
index 7afb3c5f0faa..a0d79858a896 100644
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -61,7 +61,8 @@ if(LLVM_LIBC_FULL_BUILD OR LLVM_LIBC_GPU_BUILD)
endif()
endif()
# We will build the GPU utilities if we are not doing a runtimes build.
-if(LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD)
+option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF)
+if(LIBC_BUILD_GPU_LOADER OR (LLVM_LIBC_GPU_BUILD AND NOT LLVM_RUNTIMES_BUILD))
add_subdirectory(utils/gpu)
endif()
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index dbf81c284e78..6abb35ab0ead 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -195,6 +195,7 @@ set(TARGET_LIBC_ENTRYPOINTS
# stdio.h entrypoints
libc.src.stdio.remove
+ libc.src.stdio.rename
libc.src.stdio.sprintf
libc.src.stdio.snprintf
libc.src.stdio.vsprintf
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index b42a55a4d712..e34c87ec6b5d 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -196,6 +196,7 @@ set(TARGET_LIBC_ENTRYPOINTS
# stdio.h entrypoints
libc.src.stdio.remove
+ libc.src.stdio.rename
libc.src.stdio.sprintf
libc.src.stdio.snprintf
libc.src.stdio.fprintf
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index c216f4349627..8e1ab5cd65f0 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -198,6 +198,7 @@ set(TARGET_LIBC_ENTRYPOINTS
# stdio.h entrypoints
libc.src.stdio.remove
+ libc.src.stdio.rename
libc.src.stdio.sprintf
libc.src.stdio.snprintf
libc.src.stdio.fprintf
diff --git a/libc/docs/stdio.rst b/libc/docs/stdio.rst
index 4fd6b71a0917..d17821562c25 100644
--- a/libc/docs/stdio.rst
+++ b/libc/docs/stdio.rst
@@ -68,7 +68,7 @@ These functions operate on files on the host's system, without using the
Function_Name Available
============= =========
remove |check|
-rename
+rename |check|
tmpnam
============= =========
diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h
index db8a4ea65bd6..2605535b927d 100644
--- a/libc/include/llvm-libc-macros/math-macros.h
+++ b/libc/include/llvm-libc-macros/math-macros.h
@@ -30,10 +30,6 @@
#define FP_LLOGB0 (-LONG_MAX - 1)
#define FP_LLOGBNAN LONG_MAX
-#define isfinite(x) __builtin_isfinite(x)
-#define isinf(x) __builtin_isinf(x)
-#define isnan(x) __builtin_isnan(x)
-
#ifdef __FAST_MATH__
#define math_errhandling 0
#elif defined(__NO_MATH_ERRNO__)
@@ -44,4 +40,32 @@
#define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT)
#endif
+// These must be type-generic functions. The C standard specifies them as
+// being macros rather than functions, in fact. However, in C++ it's important
+// that there be function declarations that don't interfere with other uses of
+// the identifier, even in places with parentheses where a function-like macro
+// will be expanded (such as a function declaration in a C++ namespace).
+
+#ifdef __cplusplus
+
+template <typename T> inline constexpr bool isfinite(T x) {
+ return __builtin_isfinite(x);
+}
+
+template <typename T> inline constexpr bool isinf(T x) {
+ return __builtin_isinf(x);
+}
+
+template <typename T> inline constexpr bool isnan(T x) {
+ return __builtin_isnan(x);
+}
+
+#else
+
+#define isfinite(x) __builtin_isfinite(x)
+#define isinf(x) __builtin_isinf(x)
+#define isnan(x) __builtin_isnan(x)
+
+#endif
+
#endif // LLVM_LIBC_MACROS_MATH_MACROS_H
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 920036adfed5..76010a4b4533 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -707,6 +707,11 @@ def StdC : StandardSpec<"stdc"> {
[ArgSpec<ConstCharPtr>]
>,
FunctionSpec<
+ "rename",
+ RetValSpec<IntType>,
+ [ArgSpec<ConstCharPtr>, ArgSpec<ConstCharPtr>]
+ >,
+ FunctionSpec<
"setbuf",
RetValSpec<VoidType>,
[ArgSpec<FILERestrictedPtr>, ArgSpec<CharRestrictedPtr>]
diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt
index 6216505eae23..f76285be5219 100644
--- a/libc/src/__support/CPP/CMakeLists.txt
+++ b/libc/src/__support/CPP/CMakeLists.txt
@@ -103,23 +103,24 @@ add_header_library(
type_traits
HDRS
type_traits.h
- type_traits/always_false.h
type_traits/add_lvalue_reference.h
type_traits/add_pointer.h
type_traits/add_rvalue_reference.h
+ type_traits/always_false.h
type_traits/bool_constant.h
type_traits/conditional.h
type_traits/decay.h
type_traits/enable_if.h
type_traits/false_type.h
type_traits/integral_constant.h
- type_traits/invoke.h
type_traits/invoke_result.h
+ type_traits/invoke.h
type_traits/is_arithmetic.h
type_traits/is_array.h
type_traits/is_base_of.h
type_traits/is_class.h
type_traits/is_const.h
+ type_traits/is_constant_evaluated.h
type_traits/is_convertible.h
type_traits/is_destructible.h
type_traits/is_enum.h
diff --git a/libc/src/__support/CPP/type_traits.h b/libc/src/__support/CPP/type_traits.h
index 697cf79d6ccc..1494aeb905e0 100644
--- a/libc/src/__support/CPP/type_traits.h
+++ b/libc/src/__support/CPP/type_traits.h
@@ -25,6 +25,7 @@
#include "src/__support/CPP/type_traits/is_base_of.h"
#include "src/__support/CPP/type_traits/is_class.h"
#include "src/__support/CPP/type_traits/is_const.h"
+#include "src/__support/CPP/type_traits/is_constant_evaluated.h"
#include "src/__support/CPP/type_traits/is_convertible.h"
#include "src/__support/CPP/type_traits/is_destructible.h"
#include "src/__support/CPP/type_traits/is_enum.h"
diff --git a/libc/src/__support/CPP/type_traits/is_constant_evaluated.h b/libc/src/__support/CPP/type_traits/is_constant_evaluated.h
new file mode 100644
index 000000000000..93cfd07b567f
--- /dev/null
+++ b/libc/src/__support/CPP/type_traits/is_constant_evaluated.h
@@ -0,0 +1,21 @@
+//===-- is_constant_evaluated type_traits -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTANT_EVALUATED_H
+#define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTANT_EVALUATED_H
+
+#include "src/__support/macros/attributes.h"
+
+namespace LIBC_NAMESPACE::cpp {
+
+LIBC_INLINE constexpr bool is_constant_evaluated() {
+ return __builtin_is_constant_evaluated();
+}
+
+} // namespace LIBC_NAMESPACE::cpp
+
+#endif // LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTANT_EVALUATED_H
diff --git a/libc/src/stdio/CMakeLists.txt b/libc/src/stdio/CMakeLists.txt
index ece93fd56ef0..11e15c917351 100644
--- a/libc/src/stdio/CMakeLists.txt
+++ b/libc/src/stdio/CMakeLists.txt
@@ -256,6 +256,13 @@ add_entrypoint_object(
.${LIBC_TARGET_OS}.remove
)
+add_entrypoint_object(
+ rename
+ ALIAS
+ DEPENDS
+ .${LIBC_TARGET_OS}.rename
+)
+
# These entrypoints have multiple potential implementations.
add_stdio_entrypoint_object(feof)
add_stdio_entrypoint_object(feof_unlocked)
diff --git a/libc/src/stdio/linux/CMakeLists.txt b/libc/src/stdio/linux/CMakeLists.txt
index 774f24b2db0b..a08ff0ba4832 100644
--- a/libc/src/stdio/linux/CMakeLists.txt
+++ b/libc/src/stdio/linux/CMakeLists.txt
@@ -12,3 +12,15 @@ add_entrypoint_object(
libc.src.__support.OSUtil.osutil
libc.src.errno.errno
)
+
+add_entrypoint_object(
+ rename
+ SRCS
+ rename.cpp
+ HDRS
+ ../rename.h
+ DEPENDS
+ libc.include.sys_syscall
+ libc.src.__support.OSUtil.osutil
+ libc.src.errno.errno
+)
diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp
new file mode 100644
index 000000000000..379a6ef3c0c8
--- /dev/null
+++ b/libc/src/stdio/linux/rename.cpp
@@ -0,0 +1,28 @@
+//===-- Linux implementation of rename ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/stdio/rename.h"
+#include "include/llvm-libc-macros/linux/fcntl-macros.h"
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) {
+ int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_renameat2, AT_FDCWD, oldpath,
+ AT_FDCWD, newpath, 0);
+
+ if (ret >= 0)
+ return 0;
+ libc_errno = -ret;
+ return -1;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/stdio/rename.h b/libc/src/stdio/rename.h
new file mode 100644
index 000000000000..eadda7c3eac9
--- /dev/null
+++ b/libc/src/stdio/rename.h
@@ -0,0 +1,18 @@
+//===-- Implementation header of rename -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDIO_RENAME_H
+#define LLVM_LIBC_SRC_STDIO_RENAME_H
+
+namespace LIBC_NAMESPACE {
+
+int rename(const char *oldpath, const char *newpath);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDIO_RENAME_H
diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt
index 3ccce16a76a2..4c38e8aba7d7 100644
--- a/libc/test/src/stdio/CMakeLists.txt
+++ b/libc/test/src/stdio/CMakeLists.txt
@@ -354,6 +354,21 @@ if(${LIBC_TARGET_OS} STREQUAL "linux")
libc.src.unistd.access
libc.src.unistd.close
)
+
+ add_libc_test(
+ rename_test
+ SUITE
+ libc_stdio_unittests
+ SRCS
+ rename_test.cpp
+ DEPENDS
+ libc.src.errno.errno
+ libc.src.fcntl.open
+ libc.src.stdio.rename
+ libc.src.unistd.access
+ libc.src.unistd.close
+ libc.test.UnitTest.ErrnoSetterMatcher
+ )
endif()
add_libc_test(
diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp
new file mode 100644
index 000000000000..a5dd734c6361
--- /dev/null
+++ b/libc/test/src/stdio/rename_test.cpp
@@ -0,0 +1,51 @@
+//===-- Unittests for rename ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "include/llvm-libc-macros/linux/sys-stat-macros.h"
+#include "include/llvm-libc-macros/linux/unistd-macros.h"
+#include "src/errno/libc_errno.h"
+#include "src/fcntl/open.h"
+#include "src/stdio/rename.h"
+#include "src/unistd/access.h"
+#include "src/unistd/close.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcRenameTest, CreateAndRenameFile) {
+ // The test strategy is to create a file and rename it, and also verify that
+ // it was renamed.
+ LIBC_NAMESPACE::libc_errno = 0;
+ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
+ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds;
+
+ constexpr const char *FILENAME0 = "rename.test.file0";
+ auto TEST_FILEPATH0 = libc_make_test_file_path(FILENAME0);
+
+ int fd = LIBC_NAMESPACE::open(TEST_FILEPATH0, O_WRONLY | O_CREAT, S_IRWXU);
+ ASSERT_ERRNO_SUCCESS();
+ ASSERT_GT(fd, 0);
+ ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0));
+ ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Succeeds(0));
+
+ constexpr const char *FILENAME1 = "rename.test.file1";
+ auto TEST_FILEPATH1 = libc_make_test_file_path(FILENAME1);
+ ASSERT_THAT(LIBC_NAMESPACE::rename(TEST_FILEPATH0, TEST_FILEPATH1),
+ Succeeds(0));
+ ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH1, F_OK), Succeeds(0));
+ ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT));
+}
+
+TEST(LlvmLibcRenameTest, RenameNonExistent) {
+ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
+
+ constexpr const char *FILENAME1 = "rename.test.file1";
+ auto TEST_FILEPATH1 = libc_make_test_file_path(FILENAME1);
+
+ ASSERT_THAT(LIBC_NAMESPACE::rename("non-existent", TEST_FILEPATH1),
+ Fails(ENOENT));
+}
diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index c70ae477fafc..cac42f9c3c3f 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -70,7 +70,8 @@ Deprecations and Removals
- The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` macro that changed the behavior for narrowing conversions
in ``std::variant`` has been removed in LLVM 19.
-- TODO: The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` macro has been removed in LLVM 19.
+- The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION``
+ macros have been removed in LLVM 19.
- TODO: The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have
been removed in LLVM 19. C++17 and C++20 removed features can still be re-enabled individually.
diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv
index a10319235006..8de265f4d1b6 100644
--- a/libcxx/docs/Status/Cxx23Issues.csv
+++ b/libcxx/docs/Status/Cxx23Issues.csv
@@ -295,7 +295,7 @@
"`3847 <https://wg21.link/LWG3847>`__","``ranges::to`` can still return views","February 2023","|Complete|","17.0","|ranges|"
"`3862 <https://wg21.link/LWG3862>`__","``basic_const_iterator``'s ``common_type`` specialization is underconstrained","February 2023","","",""
"`3865 <https://wg21.link/LWG3865>`__","Sorting a range of ``pairs``","February 2023","|Complete|","17.0","|ranges|"
-"`3869 <https://wg21.link/LWG3869>`__","Deprecate ``std::errc`` constants related to UNIX STREAMS","February 2023","","",""
+"`3869 <https://wg21.link/LWG3869>`__","Deprecate ``std::errc`` constants related to UNIX STREAMS","February 2023","|Complete|","19.0",""
"`3870 <https://wg21.link/LWG3870>`__","Remove ``voidify``","February 2023","","",""
"`3871 <https://wg21.link/LWG3871>`__","Adjust note about ``terminate``","February 2023","","",""
"`3872 <https://wg21.link/LWG3872>`__","``basic_const_iterator`` should have custom ``iter_move``","February 2023","","",""
diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst
index 3b1be286c169..ac12b0b96950 100644
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@@ -244,18 +244,6 @@ C++20 Specific Configuration Macros
This macro is deprecated and will be removed in LLVM-19. Use the
individual macros listed below.
-**_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS**:
- This macro is used to re-enable redundant members of `allocator<T>`,
- including `pointer`, `reference`, `rebind`, `address`, `max_size`,
- `construct`, `destroy`, and the two-argument overload of `allocate`.
- This macro has been deprecated and will be removed in LLVM-19.
-
-**_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION**:
- This macro is used to re-enable the library-provided specializations of
- `allocator<void>` and `allocator<const void>`.
- Use it in conjunction with `_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`
- to ensure that removed members of `allocator<void>` can be accessed.
-
**_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS**:
This macro is used to re-enable the `argument_type`, `result_type`,
`first_argument_type`, and `second_argument_type` members of class
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 11e13e0c2498..132cace3d5b2 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -1238,8 +1238,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c
# endif // _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES
# if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES)
-# define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-# define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION
# define _LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS
# define _LIBCPP_ENABLE_CXX20_REMOVED_NEGATORS
# define _LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR
diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h
index 4e6303914c38..26e5d4978b15 100644
--- a/libcxx/include/__memory/allocator.h
+++ b/libcxx/include/__memory/allocator.h
@@ -31,18 +31,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _Tp>
class allocator;
-#if defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS)
-# pragma clang deprecated( \
- _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS, \
- "_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS is deprecated in LLVM 18 and will be removed in LLVM 19")
-#endif
-
-#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION)
+#if _LIBCPP_STD_VER <= 17
// These specializations shouldn't be marked _LIBCPP_DEPRECATED_IN_CXX17.
// Specializing allocator<void> is deprecated, but not using it.
template <>
class _LIBCPP_TEMPLATE_VIS allocator<void> {
-# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS)
+# if _LIBCPP_STD_VER <= 17
public:
_LIBCPP_DEPRECATED_IN_CXX17 typedef void* pointer;
@@ -58,7 +52,7 @@ public:
template <>
class _LIBCPP_TEMPLATE_VIS allocator<const void> {
-# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS)
+# if _LIBCPP_STD_VER <= 17
public:
_LIBCPP_DEPRECATED_IN_CXX17 typedef const void* pointer;
@@ -141,7 +135,7 @@ public:
}
// C++20 Removed members
-#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS)
+#if _LIBCPP_STD_VER <= 17
_LIBCPP_DEPRECATED_IN_CXX17 typedef _Tp* pointer;
_LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer;
_LIBCPP_DEPRECATED_IN_CXX17 typedef _Tp& reference;
@@ -221,7 +215,7 @@ public:
}
// C++20 Removed members
-#if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS)
+#if _LIBCPP_STD_VER <= 17
_LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* pointer;
_LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp* const_pointer;
_LIBCPP_DEPRECATED_IN_CXX17 typedef const _Tp& reference;
diff --git a/libcxx/include/__string/char_traits.h b/libcxx/include/__string/char_traits.h
index 5880d3a22db2..47ed1057caaa 100644
--- a/libcxx/include/__string/char_traits.h
+++ b/libcxx/include/__string/char_traits.h
@@ -9,7 +9,6 @@
#ifndef _LIBCPP___STRING_CHAR_TRAITS_H
#define _LIBCPP___STRING_CHAR_TRAITS_H
-#include <__algorithm/copy_n.h>
#include <__algorithm/fill_n.h>
#include <__algorithm/find_end.h>
#include <__algorithm/find_first_of.h>
@@ -144,7 +143,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char> {
copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT {
_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2),
"char_traits::copy: source and destination ranges overlap");
- std::copy_n(__s2, __n, __s1);
+ std::__constexpr_memmove(__s1, __s2, __element_count(__n));
return __s1;
}
@@ -221,7 +220,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<wchar_t> {
copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT {
_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2),
"char_traits::copy: source and destination ranges overlap");
- std::copy_n(__s2, __n, __s1);
+ std::__constexpr_memmove(__s1, __s2, __element_count(__n));
return __s1;
}
@@ -287,7 +286,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char8_t> {
copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT {
_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2),
"char_traits::copy: source and destination ranges overlap");
- std::copy_n(__s2, __n, __s1);
+ std::__constexpr_memmove(__s1, __s2, __element_count(__n));
return __s1;
}
@@ -366,7 +365,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char16_t> {
copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT {
_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(!std::__is_pointer_in_range(__s1, __s1 + __n, __s2),
"char_traits::copy: source and destination ranges overlap");
- std::copy_n(__s2, __n, __s1);
+ std::__constexpr_memmove(__s1, __s2, __element_count(__n));
return __s1;
}
@@ -454,7 +453,7 @@ struct _LIBCPP_TEMPLATE_VIS char_traits<char32_t> {
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 static char_type*
copy(char_type* __s1, const char_type* __s2, size_t __n) _NOEXCEPT {
- std::copy_n(__s2, __n, __s1);
+ std::__constexpr_memmove(__s1, __s2, __element_count(__n));
return __s1;
}
diff --git a/libcxx/include/__system_error/errc.h b/libcxx/include/__system_error/errc.h
index f87df86a71e1..e9f3656b7b9c 100644
--- a/libcxx/include/__system_error/errc.h
+++ b/libcxx/include/__system_error/errc.h
@@ -58,18 +58,18 @@ enum class errc
no_child_process, // ECHILD
no_link, // ENOLINK
no_lock_available, // ENOLCK
- no_message_available, // ENODATA
+ no_message_available, // ENODATA // deprecated
no_message, // ENOMSG
no_protocol_option, // ENOPROTOOPT
no_space_on_device, // ENOSPC
- no_stream_resources, // ENOSR
+ no_stream_resources, // ENOSR // deprecated
no_such_device_or_address, // ENXIO
no_such_device, // ENODEV
no_such_file_or_directory, // ENOENT
no_such_process, // ESRCH
not_a_directory, // ENOTDIR
not_a_socket, // ENOTSOCK
- not_a_stream, // ENOSTR
+ not_a_stream, // ENOSTR // deprecated
not_connected, // ENOTCONN
not_enough_memory, // ENOMEM
not_supported, // ENOTSUP
@@ -87,7 +87,7 @@ enum class errc
resource_unavailable_try_again, // EAGAIN
result_out_of_range, // ERANGE
state_not_recoverable, // ENOTRECOVERABLE
- stream_timeout, // ETIME
+ stream_timeout, // ETIME // deprecated
text_file_busy, // ETXTBSY
timed_out, // ETIMEDOUT
too_many_files_open_in_system, // ENFILE
@@ -107,12 +107,34 @@ enum class errc
# pragma GCC system_header
#endif
+// The method of pushing and popping the diagnostics fails for GCC. GCC does
+// not recognize the pragmas used to generate deprecated diagnostics for
+// macros. So GCC does not need the pushing and popping.
+//
+// TODO Remove this when the deprecated constants are removed.
+#if defined(_LIBCPP_COMPILER_CLANG_BASED)
+# define _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH _LIBCPP_SUPPRESS_DEPRECATED_PUSH
+# define _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP _LIBCPP_SUPPRESS_DEPRECATED_POP
+#else
+# define _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH
+# define _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP
+#endif
+
_LIBCPP_BEGIN_NAMESPACE_STD
// Some error codes are not present on all platforms, so we provide equivalents
// for them:
// enum class errc
+//
+// LWG3869 deprecates the UNIX STREAMS macros and enum values.
+// This makes the code cumbersome:
+// - the enum value is deprecated and should show a diagnostic,
+// - the macro is deprecated and should _not_ show a diagnostic in this
+// context, and
+// - the macro is not always available.
+// This leads to the odd pushing and popping of the deprecated
+// diagnostic.
_LIBCPP_DECLARE_STRONG_ENUM(errc){
address_family_not_supported = EAFNOSUPPORT,
address_in_use = EADDRINUSE,
@@ -154,30 +176,48 @@ _LIBCPP_DECLARE_STRONG_ENUM(errc){
no_child_process = ECHILD,
no_link = ENOLINK,
no_lock_available = ENOLCK,
+ // clang-format off
+ no_message_available _LIBCPP_DEPRECATED =
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH
#ifdef ENODATA
- no_message_available = ENODATA,
+ ENODATA
#else
- no_message_available = ENOMSG,
+ ENOMSG
#endif
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP
+ ,
+ // clang-format on
no_message = ENOMSG,
no_protocol_option = ENOPROTOOPT,
no_space_on_device = ENOSPC,
+ // clang-format off
+ no_stream_resources _LIBCPP_DEPRECATED =
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH
#ifdef ENOSR
- no_stream_resources = ENOSR,
+ ENOSR
#else
- no_stream_resources = ENOMEM,
+ ENOMEM
#endif
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP
+ ,
+ // clang-format on
no_such_device_or_address = ENXIO,
no_such_device = ENODEV,
no_such_file_or_directory = ENOENT,
no_such_process = ESRCH,
not_a_directory = ENOTDIR,
not_a_socket = ENOTSOCK,
+ // clang-format off
+ not_a_stream _LIBCPP_DEPRECATED =
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH
#ifdef ENOSTR
- not_a_stream = ENOSTR,
+ ENOSTR
#else
- not_a_stream = EINVAL,
+ EINVAL
#endif
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP
+ ,
+ // clang-format on
not_connected = ENOTCONN,
not_enough_memory = ENOMEM,
not_supported = ENOTSUP,
@@ -195,11 +235,17 @@ _LIBCPP_DECLARE_STRONG_ENUM(errc){
resource_unavailable_try_again = EAGAIN,
result_out_of_range = ERANGE,
state_not_recoverable = ENOTRECOVERABLE,
+ // clang-format off
+ stream_timeout _LIBCPP_DEPRECATED =
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_PUSH
#ifdef ETIME
- stream_timeout = ETIME,
+ ETIME
#else
- stream_timeout = ETIMEDOUT,
+ ETIMEDOUT
#endif
+ _LIBCPP_SUPPRESS_DEPRECATED_ERRC_POP
+ ,
+ // clang-format on
text_file_busy = ETXTBSY,
timed_out = ETIMEDOUT,
too_many_files_open_in_system = ENFILE,
diff --git a/libcxx/include/cerrno b/libcxx/include/cerrno
index d488fa72a54b..6171ae31f184 100644
--- a/libcxx/include/cerrno
+++ b/libcxx/include/cerrno
@@ -38,4 +38,17 @@ Macros:
# pragma GCC system_header
#endif
+#ifdef ENODATA
+# pragma clang deprecated(ENODATA, "ENODATA is deprecated in ISO C++")
+#endif
+#ifdef ENOSR
+# pragma clang deprecated(ENOSR, "ENOSR is deprecated in ISO C++")
+#endif
+#ifdef ENOSTR
+# pragma clang deprecated(ENOSTR, "ENOSTR is deprecated in ISO C++")
+#endif
+#ifdef ETIME
+# pragma clang deprecated(ETIME, "ETIME is deprecated in ISO C++")
+#endif
+
#endif // _LIBCPP_CERRNO
diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt
index 0dea8cfca94a..d47d19a47553 100644
--- a/libcxx/modules/CMakeLists.txt
+++ b/libcxx/modules/CMakeLists.txt
@@ -206,9 +206,20 @@ add_custom_target(generate-cxx-modules
# Configure the modules manifest.
# Use the relative path between the installation and the module in the json
# file. This allows moving the entire installation to a different location.
+if("${CMAKE_INSTALL_PREFIX}" STREQUAL "")
+ set(BASE_DIRECTORY "/")
+else()
+ set(BASE_DIRECTORY ${CMAKE_INSTALL_PREFIX})
+endif()
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_LIBRARY_DIR
+ BASE_DIRECTORY ${BASE_DIRECTORY}
+ OUTPUT_VARIABLE ABS_LIBRARY_DIR)
+cmake_path(ABSOLUTE_PATH LIBCXX_INSTALL_MODULES_DIR
+ BASE_DIRECTORY ${BASE_DIRECTORY}
+ OUTPUT_VARIABLE ABS_MODULES_DIR)
file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH
- ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR}
- ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR})
+ ${ABS_LIBRARY_DIR}
+ ${ABS_MODULES_DIR})
configure_file(
"modules.json.in"
"${LIBCXX_LIBRARY_DIR}/libc++.modules.json"
diff --git a/libcxx/src/random.cpp b/libcxx/src/random.cpp
index c7073c54da6b..93590af310e5 100644
--- a/libcxx/src/random.cpp
+++ b/libcxx/src/random.cpp
@@ -79,8 +79,10 @@ unsigned random_device::operator()() {
char* p = reinterpret_cast<char*>(&r);
while (n > 0) {
ssize_t s = read(__f_, p, n);
+ _LIBCPP_SUPPRESS_DEPRECATED_PUSH
if (s == 0)
- __throw_system_error(ENODATA, "random_device got EOF");
+ __throw_system_error(ENODATA, "random_device got EOF"); // TODO ENODATA -> ENOMSG
+ _LIBCPP_SUPPRESS_DEPRECATED_POP
if (s == -1) {
if (errno != EINTR)
__throw_system_error(errno, "random_device got an unexpected error");
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.pass.cpp
index 59657ca46a14..d9a65eee4c13 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.pass.cpp
@@ -12,11 +12,9 @@
// pointer address(reference x) const;
// const_pointer address(const_reference x) const;
-// In C++20, parts of std::allocator<T> have been removed.
-// However, for backwards compatibility, if _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// is defined before including <memory>, then removed members will be restored.
+// Removed in C++20, deprecated in C++17.
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
+// REQUIRES: c++03 || c++11 || c++14 || c++17
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
#include <memory>
@@ -25,25 +23,22 @@
#include "test_macros.h"
template <class T>
-void test_address()
-{
- T* tp = new T();
- const T* ctp = tp;
- const std::allocator<T> a;
- assert(a.address(*tp) == tp);
- assert(a.address(*ctp) == tp);
- delete tp;
+void test_address() {
+ T* tp = new T();
+ const T* ctp = tp;
+ const std::allocator<T> a;
+ assert(a.address(*tp) == tp);
+ assert(a.address(*ctp) == tp);
+ delete tp;
}
-struct A
-{
- void operator&() const {}
+struct A {
+ void operator&() const {}
};
-int main(int, char**)
-{
- test_address<int>();
- test_address<A>();
+int main(int, char**) {
+ test_address<int>();
+ test_address<A>();
return 0;
}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.verify.cpp
new file mode 100644
index 000000000000..21fd4d23449b
--- /dev/null
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.cxx20.verify.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <memory>
+
+// allocator:
+// pointer address(reference x) const;
+// const_pointer address(const_reference x) const;
+
+// In C++20, parts of std::allocator<T> have been removed.
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <memory>
+#include <cassert>
+
+#include "test_macros.h"
+
+template <class T>
+void test_address() {
+ T* tp = new T();
+ const T* ctp = tp;
+ const std::allocator<T> a;
+ assert(a.address(*tp) == tp); // expected-error 2 {{no member}}
+ assert(a.address(*ctp) == tp); // expected-error 2 {{no member}}
+ delete tp;
+}
+
+struct A {
+ void operator&() const {}
+};
+
+int main(int, char**) {
+ test_address<int>();
+ test_address<A>();
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.depr_in_cxx17.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.depr_in_cxx17.verify.cpp
index 83d059a838ff..4098bdb2ee92 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.depr_in_cxx17.verify.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/address.depr_in_cxx17.verify.cpp
@@ -14,9 +14,7 @@
// Deprecated in C++17
-// UNSUPPORTED: c++03, c++11, c++14
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS -Wno-deprecated-pragma
+// REQUIRES: c++17
#include <memory>
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.pass.cpp
index f2fb606ee6db..8fc6628ebfba 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.pass.cpp
@@ -11,17 +11,18 @@
// allocator:
// T* allocate(size_t n, const void* hint);
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
+// Removed in C++20, deprecated in C++17.
+
+// REQUIRES: c++03 || c++11 || c++14 || c++17
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
#include <memory>
#include <cassert>
-#include <cstddef> // for std::max_align_t
+#include <cstddef> // for std::max_align_t
#include "test_macros.h"
#include "count_new.h"
-
#ifdef TEST_HAS_NO_ALIGNED_ALLOCATION
static const bool UsingAlignedNew = false;
#else
@@ -36,7 +37,6 @@ static const std::size_t MaxAligned = std::alignment_of<std::max_align_t>::value
static const std::size_t OverAligned = MaxAligned * 2;
-
template <std::size_t Align>
struct TEST_ALIGNAS(Align) AlignedType {
char data;
@@ -48,7 +48,6 @@ struct TEST_ALIGNAS(Align) AlignedType {
template <std::size_t Align>
int AlignedType<Align>::constructed = 0;
-
template <std::size_t Align>
void test_aligned() {
typedef AlignedType<Align> T;
@@ -56,11 +55,11 @@ void test_aligned() {
globalMemCounter.reset();
std::allocator<T> a;
const bool IsOverAlignedType = Align > MaxAligned;
- const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
+ const bool ExpectAligned = IsOverAlignedType && UsingAlignedNew;
{
- globalMemCounter.last_new_size = 0;
+ globalMemCounter.last_new_size = 0;
globalMemCounter.last_new_align = 0;
- T* ap2 = a.allocate(11, (const void*)5);
+ T* ap2 = a.allocate(11, (const void*)5);
DoNotOptimize(ap2);
assert(globalMemCounter.checkOutstandingNewEq(1));
assert(globalMemCounter.checkNewCalledEq(1));
@@ -80,14 +79,14 @@ void test_aligned() {
}
int main(int, char**) {
- test_aligned<1>();
- test_aligned<2>();
- test_aligned<4>();
- test_aligned<8>();
- test_aligned<16>();
- test_aligned<MaxAligned>();
- test_aligned<OverAligned>();
- test_aligned<OverAligned * 2>();
+ test_aligned<1>();
+ test_aligned<2>();
+ test_aligned<4>();
+ test_aligned<8>();
+ test_aligned<16>();
+ test_aligned<MaxAligned>();
+ test_aligned<OverAligned>();
+ test_aligned<OverAligned * 2>();
return 0;
}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.verify.cpp
new file mode 100644
index 000000000000..bf02c7570d39
--- /dev/null
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx20.verify.cpp
@@ -0,0 +1,23 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// <memory>
+
+// allocator:
+// T* allocate(size_t n, const void* hint);
+
+// Removed in C++20.
+
+#include <memory>
+
+void f() {
+ std::allocator<int> a;
+ a.allocate(3, nullptr); // expected-error {{too many arguments to function call}}
+}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.verify.cpp
deleted file mode 100644
index 2289cd6cd404..000000000000
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.cxx2a.verify.cpp
+++ /dev/null
@@ -1,28 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-
-// <memory>
-
-// allocator:
-// T* allocate(size_t n, const void* hint);
-
-// In C++20, parts of std::allocator<T> have been removed.
-// However, for backwards compatibility, if _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// is defined before including <memory>, then removed members will be restored.
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
-#include <memory>
-
-void f() {
- std::allocator<int> a;
- a.allocate(3, nullptr); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.depr_in_cxx17.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.depr_in_cxx17.verify.cpp
index 8b2e862e9503..8629df3c4164 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.depr_in_cxx17.verify.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/allocate.depr_in_cxx17.verify.cpp
@@ -13,9 +13,7 @@
// Deprecated in C++17
-// UNSUPPORTED: c++03, c++11, c++14
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS -Wno-deprecated-pragma
+// REQUIRES: c++17
#include <memory>
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.pass.cpp
index d3a7dadbbe11..9a37cf8af8e6 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.pass.cpp
@@ -11,12 +11,10 @@
// allocator:
// template <class... Args> void construct(pointer p, Args&&... args);
-// In C++20, parts of std::allocator<T> have been removed.
-// However, for backwards compatibility, if _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// is defined before including <memory>, then removed members will be restored.
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
+// In C++20, parts of std::allocator<T> have been removed.
+// In C++17, they were deprecated.
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+// REQUIRES: c++03 || c++11 || c++14 || c++17
#include <memory>
#include <cassert>
@@ -26,42 +24,39 @@
int A_constructed = 0;
-struct A
-{
- int data;
- A() {++A_constructed;}
+struct A {
+ int data;
+ A() { ++A_constructed; }
- A(const A&) {++A_constructed;}
+ A(const A&) { ++A_constructed; }
- explicit A(int) {++A_constructed;}
- A(int, int*) {++A_constructed;}
+ explicit A(int) { ++A_constructed; }
+ A(int, int*) { ++A_constructed; }
- ~A() {--A_constructed;}
+ ~A() { --A_constructed; }
};
int move_only_constructed = 0;
#if TEST_STD_VER >= 11
-class move_only
-{
- move_only(const move_only&) = delete;
- move_only& operator=(const move_only&)= delete;
+class move_only {
+ move_only(const move_only&) = delete;
+ move_only& operator=(const move_only&) = delete;
public:
- move_only(move_only&&) {++move_only_constructed;}
- move_only& operator=(move_only&&) {return *this;}
+ move_only(move_only&&) { ++move_only_constructed; }
+ move_only& operator=(move_only&&) { return *this; }
- move_only() {++move_only_constructed;}
- ~move_only() {--move_only_constructed;}
+ move_only() { ++move_only_constructed; }
+ ~move_only() { --move_only_constructed; }
public:
- int data; // unused other than to make sizeof(move_only) == sizeof(int).
- // but public to suppress "-Wunused-private-field"
+ int data; // unused other than to make sizeof(move_only) == sizeof(int).
+ // but public to suppress "-Wunused-private-field"
};
#endif // TEST_STD_VER >= 11
-int main(int, char**)
-{
+int main(int, char**) {
globalMemCounter.reset();
{
std::allocator<A> a;
@@ -69,7 +64,7 @@ int main(int, char**)
assert(A_constructed == 0);
globalMemCounter.last_new_size = 0;
- A* ap = a.allocate(3);
+ A* ap = a.allocate(3);
DoNotOptimize(ap);
assert(globalMemCounter.checkOutstandingNewEq(1));
assert(globalMemCounter.checkLastNewSizeEq(3 * sizeof(int)));
@@ -113,13 +108,13 @@ int main(int, char**)
assert(A_constructed == 0);
}
#if TEST_STD_VER >= 11
- {
+ {
std::allocator<move_only> a;
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(move_only_constructed == 0);
globalMemCounter.last_new_size = 0;
- move_only* ap = a.allocate(3);
+ move_only* ap = a.allocate(3);
DoNotOptimize(ap);
assert(globalMemCounter.checkOutstandingNewEq(1));
assert(globalMemCounter.checkLastNewSizeEq(3 * sizeof(int)));
@@ -145,7 +140,7 @@ int main(int, char**)
DoNotOptimize(ap);
assert(globalMemCounter.checkOutstandingNewEq(0));
assert(move_only_constructed == 0);
- }
+ }
#endif
return 0;
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.verify.cpp
new file mode 100644
index 000000000000..b39f9d918c95
--- /dev/null
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/construct.cxx20.verify.cpp
@@ -0,0 +1,77 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <memory>
+
+// allocator:
+// template <class... Args> void construct(pointer p, Args&&... args);
+
+// Removed in C++20.
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <memory>
+#include <cassert>
+
+int A_constructed = 0;
+
+struct A {
+ int data;
+ A() { ++A_constructed; }
+
+ A(const A&) { ++A_constructed; }
+
+ explicit A(int) { ++A_constructed; }
+ A(int, int*) { ++A_constructed; }
+
+ ~A() { --A_constructed; }
+};
+
+int move_only_constructed = 0;
+
+class move_only {
+ move_only(const move_only&) = delete;
+ move_only& operator=(const move_only&) = delete;
+
+public:
+ move_only(move_only&&) { ++move_only_constructed; }
+ move_only& operator=(move_only&&) { return *this; }
+
+ move_only() { ++move_only_constructed; }
+ ~move_only() { --move_only_constructed; }
+
+public:
+ int data; // unused other than to make sizeof(move_only) == sizeof(int).
+ // but public to suppress "-Wunused-private-field"
+};
+
+int main(int, char**) {
+ {
+ std::allocator<A> a;
+ A* ap = a.allocate(3);
+ a.construct(ap); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.construct(ap, A()); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.construct(ap, 5); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.construct(ap, 5, (int*)0); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.deallocate(ap, 3);
+ }
+ {
+ std::allocator<move_only> a;
+ move_only* ap = a.allocate(3);
+ a.construct(ap); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.construct(ap, move_only()); // expected-error {{no member}}
+ a.destroy(ap); // expected-error {{no member}}
+ a.deallocate(ap, 3);
+ }
+ return 0;
+}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.pass.cpp
index b07568355fee..92e3b919b0f7 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.pass.cpp
@@ -11,11 +11,9 @@
// allocator:
// size_type max_size() const throw();
-// In C++20, parts of std::allocator<T> have been removed.
-// However, for backwards compatibility, if _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// is defined before including <memory>, then removed members will be restored.
+// Removed in C++20, deprecated in C++17.
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
+// REQUIRES: c++03 || c++11 || c++14 || c++17
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
#include <memory>
@@ -27,11 +25,10 @@
int new_called = 0;
-int main(int, char**)
-{
- const std::allocator<int> a;
- std::size_t M = a.max_size();
- assert(M > 0xFFFF && M <= (std::numeric_limits<std::size_t>::max() / sizeof(int)));
+int main(int, char**) {
+ const std::allocator<int> a;
+ std::size_t M = a.max_size();
+ assert(M > 0xFFFF && M <= (std::numeric_limits<std::size_t>::max() / sizeof(int)));
return 0;
}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.verify.cpp
new file mode 100644
index 000000000000..0e0f3c3f4aa4
--- /dev/null
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator.members/max_size.cxx20.verify.cpp
@@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <memory>
+
+// allocator:
+// size_type max_size() const throw();
+
+// In C++20, parts of std::allocator<T> have been removed.
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <memory>
+#include <limits>
+#include <cstddef>
+#include <cassert>
+
+#include "test_macros.h"
+
+int new_called = 0;
+
+int main(int, char**) {
+ const std::allocator<int> a;
+ std::size_t M = a.max_size(); // expected-error {{no member}}
+ assert(M > 0xFFFF && M <= (std::numeric_limits<std::size_t>::max() / sizeof(int)));
+
+ return 0;
+}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx20.pass.cpp
index a6134b04a8f5..e462e07d896c 100644
--- a/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx2a.pass.cpp
+++ b/libcxx/test/libcxx/depr/depr.default.allocator/allocator_types.cxx20.pass.cpp
@@ -26,7 +26,9 @@
// ...
// };
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
+// Removed in C++20, deprecated in C++17.
+
+// REQUIRES: c++03 || c++11 || c++14 || c++17
// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
#include <memory>
@@ -35,17 +37,17 @@
template <class T>
void test() {
- static_assert((std::is_same<typename std::allocator<T>::size_type, std::size_t>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::difference_type, std::ptrdiff_t>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::pointer, T*>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::const_pointer, const T*>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::reference, T&>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::const_reference, const T&>::value), "");
- static_assert((std::is_same<typename std::allocator<T>::template rebind<int>::other,
- std::allocator<int> >::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::size_type, std::size_t>::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::difference_type, std::ptrdiff_t>::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::pointer, T*>::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::const_pointer, const T*>::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::reference, T&>::value), "");
+ static_assert((std::is_same<typename std::allocator<T>::const_reference, const T&>::value), "");
+ static_assert(
+ (std::is_same<typename std::allocator<T>::template rebind<int>::other, std::allocator<int> >::value), "");
}
int main(int, char**) {
- test<char>();
- return 0;
+ test<char>();
+ return 0;
}
diff --git a/libcxx/test/libcxx/depr/depr.default.allocator/enable_removed_allocator_members.deprecated.verify.cpp b/libcxx/test/libcxx/depr/depr.default.allocator/enable_removed_allocator_members.deprecated.verify.cpp
deleted file mode 100644
index ab6495ea9db4..000000000000
--- a/libcxx/test/libcxx/depr/depr.default.allocator/enable_removed_allocator_members.deprecated.verify.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <memory>
-
-// Ensure that defining _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS yields a
-// deprecation warning. We intend to issue a deprecation warning in LLVM 18
-// and remove the macro entirely in LLVM 19. As such, this test will be quite
-// short lived.
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-
-// UNSUPPORTED: clang-modules-build
-
-#include <memory> // expected-warning@* 1+ {{macro '_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS' has been marked as deprecated}}
diff --git a/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_allocator_void_no_members.verify.cpp b/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_allocator_void_no_members.verify.cpp
deleted file mode 100644
index 8888683a044f..000000000000
--- a/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_allocator_void_no_members.verify.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// Check that members of std::allocator<void> are not provided in C++20
-// with _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION but without
-// _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS.
-
-// UNSUPPORTED: c++03, c++11, c++14, c++17
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION
-//
-// Ignore any extra errors arising from typo correction.
-// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error
-
-#include <memory>
-
-std::allocator<void>::pointer x; // expected-error-re {{no {{(type|template)}} named 'pointer'}}
-std::allocator<void>::const_pointer y; // expected-error-re {{no {{(type|template)}} named 'const_pointer'}}
-std::allocator<void>::value_type z; // expected-error-re {{no {{(type|template)}} named 'value_type'}}
-std::allocator<void>::rebind<int>::other t; // expected-error-re {{no {{(type|template)}} named 'rebind'}}
diff --git a/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_with_removed_members.compile.pass.cpp b/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_with_removed_members.compile.pass.cpp
deleted file mode 100644
index 3f151edefe1c..000000000000
--- a/libcxx/test/libcxx/utilities/memory/default.allocator/allocator_types.void.cxx20_with_removed_members.compile.pass.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// Check that the nested types of std::allocator<void> are provided in C++20
-// with a flag that keeps the removed members.
-
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION
-
-#include <memory>
-#include <type_traits>
-
-static_assert((std::is_same<std::allocator<void>::pointer, void*>::value), "");
-static_assert((std::is_same<std::allocator<void>::const_pointer, const void*>::value), "");
-static_assert((std::is_same<std::allocator<void>::value_type, void>::value), "");
-static_assert((std::is_same<std::allocator<void>::rebind<int>::other, std::allocator<int> >::value), "");
diff --git a/libcxx/test/std/containers/sequences/deque/types.pass.cpp b/libcxx/test/std/containers/sequences/deque/types.pass.cpp
index bfe4808f863d..8c14de0c7744 100644
--- a/libcxx/test/std/containers/sequences/deque/types.pass.cpp
+++ b/libcxx/test/std/containers/sequences/deque/types.pass.cpp
@@ -28,9 +28,6 @@
// typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
// };
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
#include <deque>
#include <iterator>
#include <type_traits>
@@ -47,14 +44,22 @@ test()
typedef std::deque<T, Allocator> C;
static_assert((std::is_same<typename C::value_type, T>::value), "");
- static_assert((std::is_same<typename C::value_type, typename Allocator::value_type>::value), "");
+ static_assert(
+ (std::is_same<typename C::value_type, typename std::allocator_traits<Allocator>::value_type>::value), "");
static_assert((std::is_same<typename C::allocator_type, Allocator>::value), "");
- static_assert((std::is_same<typename C::size_type, typename Allocator::size_type>::value), "");
- static_assert((std::is_same<typename C::difference_type, typename Allocator::difference_type>::value), "");
- static_assert((std::is_same<typename C::reference, typename Allocator::reference>::value), "");
- static_assert((std::is_same<typename C::const_reference, typename Allocator::const_reference>::value), "");
- static_assert((std::is_same<typename C::pointer, typename Allocator::pointer>::value), "");
- static_assert((std::is_same<typename C::const_pointer, typename Allocator::const_pointer>::value), "");
+ static_assert(
+ (std::is_same<typename C::size_type, typename std::allocator_traits<Allocator>::size_type>::value), "");
+ static_assert(
+ (std::is_same<typename C::difference_type, typename std::allocator_traits<Allocator>::difference_type>::value),
+ "");
+ static_assert(
+ (std::is_same<typename C::reference, typename std::allocator_traits<Allocator>::value_type&>::value), "");
+ static_assert((std::is_same<typename C::const_reference,
+ const typename std::allocator_traits<Allocator>::value_type&>::value),
+ "");
+ static_assert((std::is_same<typename C::pointer, typename std::allocator_traits<Allocator>::pointer>::value), "");
+ static_assert(
+ (std::is_same<typename C::const_pointer, typename std::allocator_traits<Allocator>::const_pointer>::value), "");
static_assert((std::is_same<
typename std::iterator_traits<typename C::iterator>::iterator_category,
std::random_access_iterator_tag>::value), "");
diff --git a/libcxx/test/std/containers/sequences/list/types.pass.cpp b/libcxx/test/std/containers/sequences/list/types.pass.cpp
index 0c1ca27745ce..8fe31e3949de 100644
--- a/libcxx/test/std/containers/sequences/list/types.pass.cpp
+++ b/libcxx/test/std/containers/sequences/list/types.pass.cpp
@@ -21,9 +21,6 @@
// typedef typename allocator_type::pointer pointer;
// typedef typename allocator_type::const_pointer const_pointer;
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
#include <list>
#include <type_traits>
@@ -38,10 +35,12 @@ int main(int, char**)
typedef std::list<int> C;
static_assert((std::is_same<C::value_type, int>::value), "");
static_assert((std::is_same<C::allocator_type, std::allocator<int> >::value), "");
- static_assert((std::is_same<C::reference, std::allocator<int>::reference>::value), "");
- static_assert((std::is_same<C::const_reference, std::allocator<int>::const_reference>::value), "");
- static_assert((std::is_same<C::pointer, std::allocator<int>::pointer>::value), "");
- static_assert((std::is_same<C::const_pointer, std::allocator<int>::const_pointer>::value), "");
+ static_assert((std::is_same<C::reference, std::allocator_traits<std::allocator<int> >::value_type&>::value), "");
+ static_assert(
+ (std::is_same<C::const_reference, const std::allocator_traits<std::allocator<int> >::value_type&>::value), "");
+ static_assert((std::is_same<C::pointer, std::allocator_traits<std::allocator<int> >::pointer>::value), "");
+ static_assert(
+ (std::is_same<C::const_pointer, std::allocator_traits<std::allocator<int> >::const_pointer>::value), "");
static_assert((std::is_signed<typename C::difference_type>::value), "");
static_assert((std::is_unsigned<typename C::size_type>::value), "");
diff --git a/libcxx/test/std/containers/sequences/vector/types.pass.cpp b/libcxx/test/std/containers/sequences/vector/types.pass.cpp
index 4bcfbe7c3ea4..f4d7fa088842 100644
--- a/libcxx/test/std/containers/sequences/vector/types.pass.cpp
+++ b/libcxx/test/std/containers/sequences/vector/types.pass.cpp
@@ -28,9 +28,6 @@
// typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
// };
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS
-// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
-
#include <vector>
#include <iterator>
#include <type_traits>
@@ -52,14 +49,22 @@ test()
// blindly pulling typedefs out of the allocator. This is why we can't call
// test<int, min_allocator<int>>() below.
static_assert((std::is_same<typename C::value_type, T>::value), "");
- static_assert((std::is_same<typename C::value_type, typename Allocator::value_type>::value), "");
+ static_assert(
+ (std::is_same<typename C::value_type, typename std::allocator_traits<Allocator>::value_type>::value), "");
static_assert((std::is_same<typename C::allocator_type, Allocator>::value), "");
- static_assert((std::is_same<typename C::size_type, typename Allocator::size_type>::value), "");
- static_assert((std::is_same<typename C::difference_type, typename Allocator::difference_type>::value), "");
- static_assert((std::is_same<typename C::reference, typename Allocator::reference>::value), "");
- static_assert((std::is_same<typename C::const_reference, typename Allocator::const_reference>::value), "");
- static_assert((std::is_same<typename C::pointer, typename Allocator::pointer>::value), "");
- static_assert((std::is_same<typename C::const_pointer, typename Allocator::const_pointer>::value), "");
+ static_assert(
+ (std::is_same<typename C::size_type, typename std::allocator_traits<Allocator>::size_type>::value), "");
+ static_assert(
+ (std::is_same<typename C::difference_type, typename std::allocator_traits<Allocator>::difference_type>::value),
+ "");
+ static_assert(
+ (std::is_same<typename C::reference, typename std::allocator_traits<Allocator>::value_type&>::value), "");
+ static_assert((std::is_same<typename C::const_reference,
+ const typename std::allocator_traits<Allocator>::value_type&>::value),
+ "");
+ static_assert((std::is_same<typename C::pointer, typename std::allocator_traits<Allocator>::pointer>::value), "");
+ static_assert(
+ (std::is_same<typename C::const_pointer, typename std::allocator_traits<Allocator>::const_pointer>::value), "");
static_assert((std::is_signed<typename C::difference_type>::value), "");
static_assert((std::is_unsigned<typename C::size_type>::value), "");
diff --git a/libcxx/test/std/depr.cerro/cerrno.syn.verify.cpp b/libcxx/test/std/depr.cerro/cerrno.syn.verify.cpp
new file mode 100644
index 000000000000..3a38605570da
--- /dev/null
+++ b/libcxx/test/std/depr.cerro/cerrno.syn.verify.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: clang-modules-build
+// UNSUPPORTED: apple-clang && c++03
+
+// <cerrno>
+
+// tests LWG 3869 deprecated macros.
+//
+// Note the macros may not be defined. When they are not defined the
+// ifdef XXX does not trigger a deprecated message. So use them in the
+// ifdef and test for 2 deprecated messages.
+
+#include <cerrno>
+
+#ifdef ENODATA
+[[maybe_unused]] int nodata =
+ ENODATA; // expected-warning@cerrno.syn.verify.cpp:* 2 {{macro 'ENODATA' has been marked as deprecated}}
+#endif
+#ifdef ENOSR
+[[maybe_unused]] int nosr =
+ ENOSR; // expected-warning@cerrno.syn.verify.cpp:* 2 {{macro 'ENOSR' has been marked as deprecated}}
+#endif
+#ifdef ENOSTR
+[[maybe_unused]] int nostr =
+ ENOSTR; // expected-warning@cerrno.syn.verify.cpp:* 2 {{macro 'ENOSTR' has been marked as deprecated}}
+#endif
+#ifdef ETIME
+[[maybe_unused]] int timeout =
+ ETIME; // expected-warning@cerrno.syn.verify.cpp:* 2 {{macro 'ETIME' has been marked as deprecated}}
+#endif
diff --git a/libcxx/test/std/depr.cerro/system.error.syn.verify.cpp b/libcxx/test/std/depr.cerro/system.error.syn.verify.cpp
new file mode 100644
index 000000000000..fab5dd5b5593
--- /dev/null
+++ b/libcxx/test/std/depr.cerro/system.error.syn.verify.cpp
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// These macros do not seem to behave as expected on all Apple platforms.
+// Since the macros are not provided newer POSIX versions it is expected the
+// macros will be retroactively removed from C++. (The deprecation was
+// retroactively.)
+// UNSUPPORTED: apple-clang && (c++03 || clang-modules-build)
+
+// <system_error>
+
+// enum errc {...}
+
+// tests LWG 3869 deprecated enum members.
+
+#include <system_error>
+
+[[maybe_unused]] std::errc nodata =
+ std::errc::no_message_available; // expected-warning {{'no_message_available' is deprecated}}
+[[maybe_unused]] std::errc nosr =
+ std::errc::no_stream_resources; // expected-warning {{'no_stream_resources' is deprecated}}
+[[maybe_unused]] std::errc nostr = std::errc::not_a_stream; // expected-warning {{'not_a_stream' is deprecated}}
+[[maybe_unused]] std::errc timeout = std::errc::stream_timeout; // expected-warning {{'stream_timeout' is deprecated}}
diff --git a/libcxx/test/std/diagnostics/syserr/errc.pass.cpp b/libcxx/test/std/diagnostics/syserr/errc.pass.cpp
index e44cb50102e3..4abee08ddc66 100644
--- a/libcxx/test/std/diagnostics/syserr/errc.pass.cpp
+++ b/libcxx/test/std/diagnostics/syserr/errc.pass.cpp
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
+
// <system_error>
// enum errc {...}
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 36248925d65a..919a14b8bcf0 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -612,7 +612,7 @@ static void replaceCommonSymbols() {
if (!osec)
osec = ConcatOutputSection::getOrCreateForInput(isec);
isec->parent = osec;
- inputSections.push_back(isec);
+ addInputSection(isec);
// FIXME: CommonSymbol should store isReferencedDynamically, noDeadStrip
// and pass them on here.
@@ -1220,53 +1220,18 @@ static void createFiles(const InputArgList &args) {
static void gatherInputSections() {
TimeTraceScope timeScope("Gathering input sections");
- int inputOrder = 0;
for (const InputFile *file : inputFiles) {
for (const Section *section : file->sections) {
// Compact unwind entries require special handling elsewhere. (In
// contrast, EH frames are handled like regular ConcatInputSections.)
if (section->name == section_names::compactUnwind)
continue;
- ConcatOutputSection *osec = nullptr;
- for (const Subsection &subsection : section->subsections) {
- if (auto *isec = dyn_cast<ConcatInputSection>(subsection.isec)) {
- if (isec->isCoalescedWeak())
- continue;
- if (config->emitInitOffsets &&
- sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS) {
- in.initOffsets->addInput(isec);
- continue;
- }
- isec->outSecOff = inputOrder++;
- if (!osec)
- osec = ConcatOutputSection::getOrCreateForInput(isec);
- isec->parent = osec;
- inputSections.push_back(isec);
- } else if (auto *isec =
- dyn_cast<CStringInputSection>(subsection.isec)) {
- if (isec->getName() == section_names::objcMethname) {
- if (in.objcMethnameSection->inputOrder == UnspecifiedInputOrder)
- in.objcMethnameSection->inputOrder = inputOrder++;
- in.objcMethnameSection->addInput(isec);
- } else {
- if (in.cStringSection->inputOrder == UnspecifiedInputOrder)
- in.cStringSection->inputOrder = inputOrder++;
- in.cStringSection->addInput(isec);
- }
- } else if (auto *isec =
- dyn_cast<WordLiteralInputSection>(subsection.isec)) {
- if (in.wordLiteralSection->inputOrder == UnspecifiedInputOrder)
- in.wordLiteralSection->inputOrder = inputOrder++;
- in.wordLiteralSection->addInput(isec);
- } else {
- llvm_unreachable("unexpected input section kind");
- }
- }
+ for (const Subsection &subsection : section->subsections)
+ addInputSection(subsection.isec);
}
if (!file->objCImageInfo.empty())
in.objCImageInfo->addFile(file);
}
- assert(inputOrder <= UnspecifiedInputOrder);
}
static void foldIdenticalLiterals() {
@@ -1422,6 +1387,7 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
concatOutputSections.clear();
inputFiles.clear();
inputSections.clear();
+ inputSectionsOrder = 0;
loadedArchives.clear();
loadedObjectFrameworks.clear();
missingAutolinkWarnings.clear();
diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp
index 8f5affb1dc21..22930d52dd1d 100644
--- a/lld/MachO/InputSection.cpp
+++ b/lld/MachO/InputSection.cpp
@@ -37,6 +37,44 @@ static_assert(sizeof(void *) != 8 ||
"instances of it");
std::vector<ConcatInputSection *> macho::inputSections;
+int macho::inputSectionsOrder = 0;
+
+// Call this function to add a new InputSection and have it routed to the
+// appropriate container. Depending on its type and current config, it will
+// either be added to 'inputSections' vector or to a synthetic section.
+void lld::macho::addInputSection(InputSection *inputSection) {
+ if (auto *isec = dyn_cast<ConcatInputSection>(inputSection)) {
+ if (isec->isCoalescedWeak())
+ return;
+ if (config->emitInitOffsets &&
+ sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS) {
+ in.initOffsets->addInput(isec);
+ return;
+ }
+ isec->outSecOff = inputSectionsOrder++;
+ auto *osec = ConcatOutputSection::getOrCreateForInput(isec);
+ isec->parent = osec;
+ inputSections.push_back(isec);
+ } else if (auto *isec = dyn_cast<CStringInputSection>(inputSection)) {
+ if (isec->getName() == section_names::objcMethname) {
+ if (in.objcMethnameSection->inputOrder == UnspecifiedInputOrder)
+ in.objcMethnameSection->inputOrder = inputSectionsOrder++;
+ in.objcMethnameSection->addInput(isec);
+ } else {
+ if (in.cStringSection->inputOrder == UnspecifiedInputOrder)
+ in.cStringSection->inputOrder = inputSectionsOrder++;
+ in.cStringSection->addInput(isec);
+ }
+ } else if (auto *isec = dyn_cast<WordLiteralInputSection>(inputSection)) {
+ if (in.wordLiteralSection->inputOrder == UnspecifiedInputOrder)
+ in.wordLiteralSection->inputOrder = inputSectionsOrder++;
+ in.wordLiteralSection->addInput(isec);
+ } else {
+ llvm_unreachable("unexpected input section kind");
+ }
+
+ assert(inputSectionsOrder <= UnspecifiedInputOrder);
+}
uint64_t InputSection::getFileSize() const {
return isZeroFill(getFlags()) ? 0 : getSize();
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index b25f0638f4c6..694bdf734907 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -302,6 +302,8 @@ bool isEhFrameSection(const InputSection *);
bool isGccExceptTabSection(const InputSection *);
extern std::vector<ConcatInputSection *> inputSections;
+// This is used as a counter for specyfing input order for input sections
+extern int inputSectionsOrder;
namespace section_names {
@@ -369,6 +371,7 @@ constexpr const char addrSig[] = "__llvm_addrsig";
} // namespace section_names
+void addInputSection(InputSection *inputSection);
} // namespace macho
std::string toString(const macho::InputSection *);
diff --git a/lld/MachO/ObjC.cpp b/lld/MachO/ObjC.cpp
index 40df2243b26f..5902b82d30f5 100644
--- a/lld/MachO/ObjC.cpp
+++ b/lld/MachO/ObjC.cpp
@@ -790,7 +790,7 @@ void ObjcCategoryMerger::emitAndLinkProtocolList(
infoCategoryWriter.catPtrListInfo.align);
listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
listSec->live = true;
- allInputSections.push_back(listSec);
+ addInputSection(listSec);
listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
@@ -848,7 +848,7 @@ void ObjcCategoryMerger::emitAndLinkPointerList(
infoCategoryWriter.catPtrListInfo.align);
listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
listSec->live = true;
- allInputSections.push_back(listSec);
+ addInputSection(listSec);
listSec->parent = infoCategoryWriter.catPtrListInfo.outputSection;
@@ -889,7 +889,7 @@ ObjcCategoryMerger::emitCatListEntrySec(const std::string &forCateogryName,
bodyData, infoCategoryWriter.catListInfo.align);
newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
newCatList->live = true;
- allInputSections.push_back(newCatList);
+ addInputSection(newCatList);
newCatList->parent = infoCategoryWriter.catListInfo.outputSection;
@@ -927,7 +927,7 @@ Defined *ObjcCategoryMerger::emitCategoryBody(const std::string &name,
bodyData, infoCategoryWriter.catBodyInfo.align);
newBodySec->parent = infoCategoryWriter.catBodyInfo.outputSection;
newBodySec->live = true;
- allInputSections.push_back(newBodySec);
+ addInputSection(newBodySec);
std::string symName =
objc::symbol_names::category + baseClassName + "_$_(" + name + ")";
@@ -1132,7 +1132,7 @@ void ObjcCategoryMerger::generateCatListForNonErasedCategories(
infoCategoryWriter.catListInfo.align);
listSec->parent = infoCategoryWriter.catListInfo.outputSection;
listSec->live = true;
- allInputSections.push_back(listSec);
+ addInputSection(listSec);
std::string slotSymName = "<__objc_catlist slot for category ";
slotSymName += nonErasedCatBody->getName();
@@ -1221,9 +1221,11 @@ void ObjcCategoryMerger::doCleanup() { generatedSectionData.clear(); }
StringRef ObjcCategoryMerger::newStringData(const char *str) {
uint32_t len = strlen(str);
- auto &data = newSectionData(len + 1);
+ uint32_t bufSize = len + 1;
+ auto &data = newSectionData(bufSize);
char *strData = reinterpret_cast<char *>(data.data());
- strncpy(strData, str, len);
+ // Copy the string chars and null-terminator
+ memcpy(strData, str, bufSize);
return StringRef(strData, len);
}
diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp
index 7ee3261ce307..1b3694528de1 100644
--- a/lld/MachO/SyntheticSections.cpp
+++ b/lld/MachO/SyntheticSections.cpp
@@ -793,7 +793,7 @@ void StubHelperSection::setUp() {
in.imageLoaderCache->parent =
ConcatOutputSection::getOrCreateForInput(in.imageLoaderCache);
- inputSections.push_back(in.imageLoaderCache);
+ addInputSection(in.imageLoaderCache);
// Since this isn't in the symbol table or in any input file, the noDeadStrip
// argument doesn't matter.
dyldPrivate =
@@ -855,7 +855,7 @@ ConcatInputSection *ObjCSelRefsSection::makeSelRef(StringRef methname) {
/*addend=*/static_cast<int64_t>(methnameOffset),
/*referent=*/in.objcMethnameSection->isec});
objcSelref->parent = ConcatOutputSection::getOrCreateForInput(objcSelref);
- inputSections.push_back(objcSelref);
+ addInputSection(objcSelref);
objcSelref->isFinal = true;
methnameToSelref[CachedHashStringRef(methname)] = objcSelref;
return objcSelref;
diff --git a/lld/MinGW/Driver.cpp b/lld/MinGW/Driver.cpp
index efd643f9a322..bb08c77b2e11 100644
--- a/lld/MinGW/Driver.cpp
+++ b/lld/MinGW/Driver.cpp
@@ -455,6 +455,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
add("-lldemit:llvm");
if (args.hasArg(OPT_lto_emit_asm))
add("-lldemit:asm");
+ if (auto *arg = args.getLastArg(OPT_lto_sample_profile))
+ add("-lto-sample-profile:" + StringRef(arg->getValue()));
if (auto *a = args.getLastArg(OPT_thinlto_cache_dir))
add("-lldltocache:" + StringRef(a->getValue()));
diff --git a/lld/MinGW/Options.td b/lld/MinGW/Options.td
index 9a0a96aac7f1..56f67e3dd96c 100644
--- a/lld/MinGW/Options.td
+++ b/lld/MinGW/Options.td
@@ -160,6 +160,8 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">,
HelpText<"Context sensitive profile file path">;
def lto_emit_asm: FF<"lto-emit-asm">,
HelpText<"Emit assembly code">;
+def lto_sample_profile: JJ<"lto-sample-profile=">,
+ HelpText<"Sample profile file path">;
def thinlto_cache_dir: JJ<"thinlto-cache-dir=">,
HelpText<"Path to ThinLTO cached object file directory">;
diff --git a/lld/test/MinGW/driver.test b/lld/test/MinGW/driver.test
index a4e9e5e1b19b..619fee8dee7c 100644
--- a/lld/test/MinGW/driver.test
+++ b/lld/test/MinGW/driver.test
@@ -422,6 +422,9 @@ LTO_EMIT_ASM: -lldemit:asm
RUN: ld.lld -### foo.o -m i386pe -plugin-opt=emit-llvm 2>&1 | FileCheck -check-prefix=LTO_EMIT_LLVM %s
LTO_EMIT_LLVM: -lldemit:llvm
+RUN: ld.lld -### foo.o -m i386pep --lto-sample-profile=foo 2>&1 | FileCheck -check-prefix=LTO_SAMPLE_PROFILE %s
+LTO_SAMPLE_PROFILE: -lto-sample-profile:foo
+
Test GCC specific LTO options that GCC passes unconditionally, that we ignore.
RUN: ld.lld -### foo.o -m i386pep -plugin /usr/lib/gcc/x86_64-w64-mingw32/10-posix/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-w64-mingw32/10-posix/lto-wrapper -plugin-opt=-fresolution=/tmp/ccM9d4fP.res -plugin-opt=-pass-through=-lmingw32 2> /dev/null
diff --git a/lldb/include/lldb/Core/Disassembler.h b/lldb/include/lldb/Core/Disassembler.h
index 885ac1bb4a7e..e037a49f152c 100644
--- a/lldb/include/lldb/Core/Disassembler.h
+++ b/lldb/include/lldb/Core/Disassembler.h
@@ -538,7 +538,7 @@ protected:
ElideMixedSourceAndDisassemblyLine(const ExecutionContext &exe_ctx,
const SymbolContext &sc, LineEntry &line) {
SourceLine sl;
- sl.file = line.file;
+ sl.file = line.GetFile();
sl.line = line.line;
sl.column = line.column;
return ElideMixedSourceAndDisassemblyLine(exe_ctx, sc, sl);
diff --git a/lldb/include/lldb/Core/Progress.h b/lldb/include/lldb/Core/Progress.h
index c38f6dd0a140..eb3af9816dc6 100644
--- a/lldb/include/lldb/Core/Progress.h
+++ b/lldb/include/lldb/Core/Progress.h
@@ -66,7 +66,11 @@ public:
/// @param [in] title The title of this progress activity.
///
/// @param [in] details Specific information about what the progress report
- /// is currently working on.
+ /// is currently working on. Although not required, if the progress report is
+ /// updated with Progress::Increment() then this field will be overwritten
+ /// with the new set of details passed into that function, and the details
+ /// passed initially will act as an "item 0" for the total set of
+ /// items being reported on.
///
/// @param [in] total The total units of work to be done if specified, if
/// set to std::nullopt then an indeterminate progress indicator should be
diff --git a/lldb/include/lldb/Symbol/LineEntry.h b/lldb/include/lldb/Symbol/LineEntry.h
index 31e1cd0b36f9..8da59cf0bd24 100644
--- a/lldb/include/lldb/Symbol/LineEntry.h
+++ b/lldb/include/lldb/Symbol/LineEntry.h
@@ -130,11 +130,14 @@ struct LineEntry {
/// Shared pointer to the target this LineEntry belongs to.
void ApplyFileMappings(lldb::TargetSP target_sp);
+ /// Helper to access the file.
+ const FileSpec &GetFile() const { return file_sp->GetSpecOnly(); }
+
/// The section offset address range for this line entry.
AddressRange range;
/// The source file, possibly mapped by the target.source-map setting.
- FileSpec file;
+ lldb::SupportFileSP file_sp;
/// The original source file, from debug info.
lldb::SupportFileSP original_file_sp;
diff --git a/lldb/include/lldb/Utility/SupportFile.h b/lldb/include/lldb/Utility/SupportFile.h
index 0ea0ca4e7c97..7505d7f345c5 100644
--- a/lldb/include/lldb/Utility/SupportFile.h
+++ b/lldb/include/lldb/Utility/SupportFile.h
@@ -45,6 +45,9 @@ public:
/// Materialize the file to disk and return the path to that temporary file.
virtual const FileSpec &Materialize() { return m_file_spec; }
+ /// Change the file name.
+ void Update(const FileSpec &file_spec) { m_file_spec = file_spec; }
+
protected:
FileSpec m_file_spec;
Checksum m_checksum;
diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
index bfd249ccd43f..75efcde1f040 100644
--- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
+++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules
@@ -51,7 +51,7 @@ LLDB_BASE_DIR := $(THIS_FILE_DIR)/../../../../../
#
# GNUWin32 uname gives "windows32" or "server version windows32" while
# some versions of MSYS uname return "MSYS_NT*", but most environments
-# standardize on "Windows_NT", so we'll make it consistent here.
+# standardize on "Windows_NT", so we'll make it consistent here.
# When running tests from Visual Studio, the environment variable isn't
# inherited all the way down to the process spawned for make.
#----------------------------------------------------------------------
@@ -210,6 +210,12 @@ else
ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES"
DSYM = $(EXE).debug
endif
+
+ ifeq "$(MAKE_DWP)" "YES"
+ MAKE_DWO := YES
+ DWP_NAME = $(EXE).dwp
+ DYLIB_DWP_NAME = $(DYLIB_NAME).dwp
+ endif
endif
LIMIT_DEBUG_INFO_FLAGS =
@@ -357,6 +363,7 @@ ifneq "$(OS)" "Darwin"
OBJCOPY ?= $(call replace_cc_with,objcopy)
ARCHIVER ?= $(call replace_cc_with,ar)
+ DWP ?= $(call replace_cc_with,dwp)
override AR = $(ARCHIVER)
endif
@@ -527,6 +534,10 @@ ifneq "$(CXX)" ""
endif
endif
+ifeq "$(GEN_GNU_BUILD_ID)" "YES"
+ LDFLAGS += -Wl,--build-id
+endif
+
#----------------------------------------------------------------------
# DYLIB_ONLY variable can be used to skip the building of a.out.
# See the sections below regarding dSYM file as well as the building of
@@ -565,11 +576,25 @@ else
endif
else
ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES"
+ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES"
+ cp "$(EXE)" "$(EXE).full"
+endif
$(OBJCOPY) --only-keep-debug "$(EXE)" "$(DSYM)"
$(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DSYM)" "$(EXE)" "$(EXE)"
endif
+ifeq "$(MAKE_DWP)" "YES"
+ $(DWP) -o "$(DWP_NAME)" $(DWOS)
+endif
endif
+
+#----------------------------------------------------------------------
+# Support emitting the content of the GNU build-id into a file
+# This needs to be used in conjunction with GEN_GNU_BUILD_ID := YES
+#----------------------------------------------------------------------
+$(EXE).uuid : $(EXE)
+ $(OBJCOPY) --dump-section=.note.gnu.build-id=$@ $<
+
#----------------------------------------------------------------------
# Make the dylib
#----------------------------------------------------------------------
@@ -610,9 +635,15 @@ endif
else
$(LD) $(DYLIB_OBJECTS) $(LDFLAGS) -shared -o "$(DYLIB_FILENAME)"
ifeq "$(SPLIT_DEBUG_SYMBOLS)" "YES"
+ ifeq "$(SAVE_FULL_DEBUG_BINARY)" "YES"
+ cp "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).full"
+ endif
$(OBJCOPY) --only-keep-debug "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME).debug"
$(OBJCOPY) --strip-debug --add-gnu-debuglink="$(DYLIB_FILENAME).debug" "$(DYLIB_FILENAME)" "$(DYLIB_FILENAME)"
endif
+ifeq "$(MAKE_DWP)" "YES"
+ $(DWP) -o $(DYLIB_DWP_FILE) $(DYLIB_DWOS)
+endif
endif
#----------------------------------------------------------------------
diff --git a/lldb/source/API/SBLineEntry.cpp b/lldb/source/API/SBLineEntry.cpp
index 28d12e65fdaf..99a7b8fe644c 100644
--- a/lldb/source/API/SBLineEntry.cpp
+++ b/lldb/source/API/SBLineEntry.cpp
@@ -81,8 +81,8 @@ SBFileSpec SBLineEntry::GetFileSpec() const {
LLDB_INSTRUMENT_VA(this);
SBFileSpec sb_file_spec;
- if (m_opaque_up.get() && m_opaque_up->file)
- sb_file_spec.SetFileSpec(m_opaque_up->file);
+ if (m_opaque_up.get() && m_opaque_up->GetFile())
+ sb_file_spec.SetFileSpec(m_opaque_up->GetFile());
return sb_file_spec;
}
@@ -109,9 +109,9 @@ void SBLineEntry::SetFileSpec(lldb::SBFileSpec filespec) {
LLDB_INSTRUMENT_VA(this, filespec);
if (filespec.IsValid())
- ref().file = filespec.ref();
+ ref().file_sp = std::make_shared<SupportFile>(filespec.ref());
else
- ref().file.Clear();
+ ref().file_sp = std::make_shared<SupportFile>();
}
void SBLineEntry::SetLine(uint32_t line) {
LLDB_INSTRUMENT_VA(this, line);
@@ -168,7 +168,7 @@ bool SBLineEntry::GetDescription(SBStream &description) {
if (m_opaque_up) {
char file_path[PATH_MAX * 2];
- m_opaque_up->file.GetPath(file_path, sizeof(file_path));
+ m_opaque_up->GetFile().GetPath(file_path, sizeof(file_path));
strm.Printf("%s:%u", file_path, GetLine());
if (GetColumn() > 0)
strm.Printf(":%u", GetColumn());
diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp
index fa4c80e59d97..eb9cf063802c 100644
--- a/lldb/source/API/SBThread.cpp
+++ b/lldb/source/API/SBThread.cpp
@@ -819,7 +819,7 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame,
step_file_spec = sb_file_spec.ref();
} else {
if (frame_sc.line_entry.IsValid())
- step_file_spec = frame_sc.line_entry.file;
+ step_file_spec = frame_sc.line_entry.GetFile();
else {
sb_error.SetErrorString("invalid file argument or no file for frame");
return sb_error;
diff --git a/lldb/source/Breakpoint/BreakpointResolver.cpp b/lldb/source/Breakpoint/BreakpointResolver.cpp
index 1861a0fe7c4f..ff4e2a998519 100644
--- a/lldb/source/Breakpoint/BreakpointResolver.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolver.cpp
@@ -221,7 +221,7 @@ void BreakpointResolver::SetSCMatchesByLine(
auto &match = all_scs[0];
auto worklist_begin = std::partition(
all_scs.begin(), all_scs.end(), [&](const SymbolContext &sc) {
- if (sc.line_entry.file == match.line_entry.file ||
+ if (sc.line_entry.GetFile() == match.line_entry.GetFile() ||
*sc.line_entry.original_file_sp ==
*match.line_entry.original_file_sp) {
// When a match is found, keep track of the smallest line number.
diff --git a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
index cc4e1d26724f..d7d8c714867e 100644
--- a/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
+++ b/lldb/source/Breakpoint/BreakpointResolverFileLine.cpp
@@ -147,8 +147,9 @@ void BreakpointResolverFileLine::FilterContexts(SymbolContextList &sc_list) {
else
continue;
- if (file != sc.line_entry.file) {
- LLDB_LOG(log, "unexpected symbol context file {0}", sc.line_entry.file);
+ if (file != sc.line_entry.GetFile()) {
+ LLDB_LOG(log, "unexpected symbol context file {0}",
+ sc.line_entry.GetFile());
continue;
}
@@ -223,7 +224,7 @@ void BreakpointResolverFileLine::DeduceSourceMapping(
const bool case_sensitive = request_file.IsCaseSensitive();
for (const SymbolContext &sc : sc_list) {
- FileSpec sc_file = sc.line_entry.file;
+ FileSpec sc_file = sc.line_entry.GetFile();
if (FileSpec::Equal(sc_file, request_file, /*full*/ true))
continue;
diff --git a/lldb/source/Commands/CommandObjectBreakpoint.cpp b/lldb/source/Commands/CommandObjectBreakpoint.cpp
index fbece865f113..cd4c7790f447 100644
--- a/lldb/source/Commands/CommandObjectBreakpoint.cpp
+++ b/lldb/source/Commands/CommandObjectBreakpoint.cpp
@@ -780,8 +780,8 @@ private:
} else {
const SymbolContext &sc =
cur_frame->GetSymbolContext(eSymbolContextLineEntry);
- if (sc.line_entry.file) {
- file = sc.line_entry.file;
+ if (sc.line_entry.GetFile()) {
+ file = sc.line_entry.GetFile();
} else {
result.AppendError("Can't find the file for the selected frame to "
"use as the default file.");
diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp
index fde74f02aea6..0c1267456a18 100644
--- a/lldb/source/Commands/CommandObjectSource.cpp
+++ b/lldb/source/Commands/CommandObjectSource.cpp
@@ -158,7 +158,7 @@ protected:
if (module_list.GetSize() &&
module_list.GetIndexForModule(module) == LLDB_INVALID_INDEX32)
continue;
- if (!FileSpec::Match(file_spec, line_entry.file))
+ if (!FileSpec::Match(file_spec, line_entry.GetFile()))
continue;
if (start_line > 0 && line_entry.line < start_line)
continue;
@@ -239,7 +239,7 @@ protected:
num_matches++;
if (num_lines > 0 && num_matches > num_lines)
break;
- assert(cu_file_spec == line_entry.file);
+ assert(cu_file_spec == line_entry.GetFile());
if (!cu_header_printed) {
if (num_matches > 0)
strm << "\n\n";
@@ -760,11 +760,11 @@ protected:
bool operator<(const SourceInfo &rhs) const {
if (function.GetCString() < rhs.function.GetCString())
return true;
- if (line_entry.file.GetDirectory().GetCString() <
- rhs.line_entry.file.GetDirectory().GetCString())
+ if (line_entry.GetFile().GetDirectory().GetCString() <
+ rhs.line_entry.GetFile().GetDirectory().GetCString())
return true;
- if (line_entry.file.GetFilename().GetCString() <
- rhs.line_entry.file.GetFilename().GetCString())
+ if (line_entry.GetFile().GetFilename().GetCString() <
+ rhs.line_entry.GetFile().GetFilename().GetCString())
return true;
if (line_entry.line < rhs.line_entry.line)
return true;
@@ -799,7 +799,7 @@ protected:
sc.function->GetEndLineSourceInfo(end_file, end_line);
} else {
// We have an inlined function
- start_file = source_info.line_entry.file;
+ start_file = source_info.line_entry.GetFile();
start_line = source_info.line_entry.line;
end_line = start_line + m_options.num_lines;
}
diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp
index cf4f8ccaa0c4..3dbbfd4f9d34 100644
--- a/lldb/source/Commands/CommandObjectThread.cpp
+++ b/lldb/source/Commands/CommandObjectThread.cpp
@@ -1705,7 +1705,7 @@ protected:
line = sym_ctx.line_entry.line + m_options.m_line_offset;
// Try the current file, but override if asked.
- FileSpec file = sym_ctx.line_entry.file;
+ FileSpec file = sym_ctx.line_entry.GetFile();
if (m_options.m_filenames.GetSize() == 1)
file = m_options.m_filenames.GetFileSpecAtIndex(0);
diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp
index 6f5c366ab38a..b23398883fa5 100644
--- a/lldb/source/Core/Address.cpp
+++ b/lldb/source/Core/Address.cpp
@@ -398,7 +398,7 @@ bool Address::GetDescription(Stream &s, Target &target,
"Non-brief descriptions not implemented");
LineEntry line_entry;
if (CalculateSymbolContextLineEntry(line_entry)) {
- s.Printf(" (%s:%u:%u)", line_entry.file.GetFilename().GetCString(),
+ s.Printf(" (%s:%u:%u)", line_entry.GetFile().GetFilename().GetCString(),
line_entry.line, line_entry.column);
return true;
}
diff --git a/lldb/source/Core/Disassembler.cpp b/lldb/source/Core/Disassembler.cpp
index 7b07fcb26813..e31746fa0b8b 100644
--- a/lldb/source/Core/Disassembler.cpp
+++ b/lldb/source/Core/Disassembler.cpp
@@ -201,7 +201,7 @@ Disassembler::GetFunctionDeclLineEntry(const SymbolContext &sc) {
uint32_t func_decl_line;
sc.function->GetStartLineSourceInfo(func_decl_file, func_decl_line);
- if (func_decl_file != prologue_end_line.file &&
+ if (func_decl_file != prologue_end_line.GetFile() &&
func_decl_file != prologue_end_line.original_file_sp->GetSpecOnly())
return {};
@@ -354,7 +354,7 @@ void Disassembler::PrintInstructions(Debugger &debugger, const ArchSpec &arch,
}
if (sc.line_entry.IsValid()) {
SourceLine this_line;
- this_line.file = sc.line_entry.file;
+ this_line.file = sc.line_entry.GetFile();
this_line.line = sc.line_entry.line;
this_line.column = sc.line_entry.column;
if (!ElideMixedSourceAndDisassemblyLine(exe_ctx, sc, this_line))
@@ -406,7 +406,7 @@ void Disassembler::PrintInstructions(Debugger &debugger, const ArchSpec &arch,
uint32_t func_decl_line;
sc.function->GetStartLineSourceInfo(func_decl_file,
func_decl_line);
- if (func_decl_file == prologue_end_line.file ||
+ if (func_decl_file == prologue_end_line.GetFile() ||
func_decl_file ==
prologue_end_line.original_file_sp->GetSpecOnly()) {
// Add all the lines between the function declaration and
@@ -439,7 +439,7 @@ void Disassembler::PrintInstructions(Debugger &debugger, const ArchSpec &arch,
if (sc != prev_sc && sc.comp_unit && sc.line_entry.IsValid()) {
SourceLine this_line;
- this_line.file = sc.line_entry.file;
+ this_line.file = sc.line_entry.GetFile();
this_line.line = sc.line_entry.line;
if (!ElideMixedSourceAndDisassemblyLine(exe_ctx, sc,
diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp
index cf82676bedda..ba62e2625259 100644
--- a/lldb/source/Core/FormatEntity.cpp
+++ b/lldb/source/Core/FormatEntity.cpp
@@ -1792,7 +1792,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s,
if (sc && sc->line_entry.IsValid()) {
Module *module = sc->module_sp.get();
if (module) {
- if (DumpFile(s, sc->line_entry.file, (FileKind)entry.number))
+ if (DumpFile(s, sc->line_entry.GetFile(), (FileKind)entry.number))
return true;
}
}
diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp
index f86dce247135..d922d32f9105 100644
--- a/lldb/source/Core/IOHandlerCursesGUI.cpp
+++ b/lldb/source/Core/IOHandlerCursesGUI.cpp
@@ -6894,7 +6894,8 @@ public:
if (context_changed)
m_selected_line = m_pc_line;
- if (m_file_sp && m_file_sp->GetFileSpec() == m_sc.line_entry.file) {
+ if (m_file_sp &&
+ m_file_sp->GetFileSpec() == m_sc.line_entry.GetFile()) {
// Same file, nothing to do, we should either have the lines or
// not (source file missing)
if (m_selected_line >= static_cast<size_t>(m_first_visible_line)) {
@@ -6909,8 +6910,8 @@ public:
} else {
// File changed, set selected line to the line with the PC
m_selected_line = m_pc_line;
- m_file_sp =
- m_debugger.GetSourceManager().GetFile(m_sc.line_entry.file);
+ m_file_sp = m_debugger.GetSourceManager().GetFile(
+ m_sc.line_entry.GetFile());
if (m_file_sp) {
const size_t num_lines = m_file_sp->GetNumLines();
m_line_width = 1;
@@ -7000,7 +7001,7 @@ public:
LineEntry bp_loc_line_entry;
if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry(
bp_loc_line_entry)) {
- if (m_file_sp->GetFileSpec() == bp_loc_line_entry.file) {
+ if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile()) {
bp_lines.insert(bp_loc_line_entry.line);
}
}
@@ -7477,7 +7478,7 @@ public:
LineEntry bp_loc_line_entry;
if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry(
bp_loc_line_entry)) {
- if (m_file_sp->GetFileSpec() == bp_loc_line_entry.file &&
+ if (m_file_sp->GetFileSpec() == bp_loc_line_entry.GetFile() &&
m_selected_line + 1 == bp_loc_line_entry.line) {
bool removed =
exe_ctx.GetTargetRef().RemoveBreakpointByID(bp_sp->GetID());
diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp
index 517a4b0268d2..0d70c554e534 100644
--- a/lldb/source/Core/SourceManager.cpp
+++ b/lldb/source/Core/SourceManager.cpp
@@ -418,7 +418,7 @@ bool SourceManager::GetDefaultFileAndLine(FileSpec &file_spec, uint32_t &line) {
if (sc.function->GetAddressRange()
.GetBaseAddress()
.CalculateSymbolContextLineEntry(line_entry)) {
- SetDefaultFileAndLine(line_entry.file, line_entry.line);
+ SetDefaultFileAndLine(line_entry.GetFile(), line_entry.line);
file_spec = m_last_file_spec;
line = m_last_line;
return true;
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
index 3d43ed3f99ff..3b601726388d 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionSourceCode.cpp
@@ -417,7 +417,7 @@ bool ClangExpressionSourceCode::GetText(
if (sc.comp_unit && sc.line_entry.IsValid()) {
DebugMacros *dm = sc.comp_unit->GetDebugMacros();
if (dm) {
- AddMacroState state(sc.line_entry.file, sc.line_entry.line);
+ AddMacroState state(sc.line_entry.GetFile(), sc.line_entry.line);
AddMacros(dm, sc.comp_unit, state, debug_macros_stream);
}
}
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index 10a1fe039189..bcb04fae15bd 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -106,13 +106,13 @@ bool lldb_private::formatters::LibcxxFunctionSummaryProvider(
case CPPLanguageRuntime::LibCppStdFunctionCallableCase::Lambda:
stream.Printf(
" Lambda in File %s at Line %u",
- callable_info.callable_line_entry.file.GetFilename().GetCString(),
+ callable_info.callable_line_entry.GetFile().GetFilename().GetCString(),
callable_info.callable_line_entry.line);
break;
case CPPLanguageRuntime::LibCppStdFunctionCallableCase::CallableObject:
stream.Printf(
" Function in File %s at Line %u",
- callable_info.callable_line_entry.file.GetFilename().GetCString(),
+ callable_info.callable_line_entry.GetFile().GetFilename().GetCString(),
callable_info.callable_line_entry.line);
break;
case CPPLanguageRuntime::LibCppStdFunctionCallableCase::FreeOrMemberFunction:
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
index 5f67658f86ea..08ce7b82b0c1 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp
@@ -4377,26 +4377,38 @@ const std::shared_ptr<SymbolFileDWARFDwo> &SymbolFileDWARF::GetDwpSymbolFile() {
FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
ModuleSpec module_spec;
module_spec.GetFileSpec() = m_objfile_sp->GetFileSpec();
+ FileSpec dwp_filespec;
for (const auto &symfile : symfiles.files()) {
module_spec.GetSymbolFileSpec() =
FileSpec(symfile.GetPath() + ".dwp", symfile.GetPathStyle());
LLDB_LOG(log, "Searching for DWP using: \"{0}\"",
module_spec.GetSymbolFileSpec());
- FileSpec dwp_filespec =
+ dwp_filespec =
PluginManager::LocateExecutableSymbolFile(module_spec, search_paths);
if (FileSystem::Instance().Exists(dwp_filespec)) {
- LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec);
- DataBufferSP dwp_file_data_sp;
- lldb::offset_t dwp_file_data_offset = 0;
- ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin(
- GetObjectFile()->GetModule(), &dwp_filespec, 0,
- FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp,
- dwp_file_data_offset);
- if (dwp_obj_file) {
- m_dwp_symfile = std::make_shared<SymbolFileDWARFDwo>(
- *this, dwp_obj_file, DIERef::k_file_index_mask);
- break;
- }
+ break;
+ }
+ }
+ if (!FileSystem::Instance().Exists(dwp_filespec)) {
+ LLDB_LOG(log, "No DWP file found locally");
+ // Fill in the UUID for the module we're trying to match for, so we can
+ // find the correct DWP file, as the Debuginfod plugin uses *only* this
+ // data to correctly match the DWP file with the binary.
+ module_spec.GetUUID() = m_objfile_sp->GetUUID();
+ dwp_filespec =
+ PluginManager::LocateExecutableSymbolFile(module_spec, search_paths);
+ }
+ if (FileSystem::Instance().Exists(dwp_filespec)) {
+ LLDB_LOG(log, "Found DWP file: \"{0}\"", dwp_filespec);
+ DataBufferSP dwp_file_data_sp;
+ lldb::offset_t dwp_file_data_offset = 0;
+ ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin(
+ GetObjectFile()->GetModule(), &dwp_filespec, 0,
+ FileSystem::Instance().GetByteSize(dwp_filespec), dwp_file_data_sp,
+ dwp_file_data_offset);
+ if (dwp_obj_file) {
+ m_dwp_symfile = std::make_shared<SymbolFileDWARFDwo>(
+ *this, dwp_obj_file, DIERef::k_file_index_mask);
}
}
if (!m_dwp_symfile) {
diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
index ca969626f4ff..3367022639ab 100644
--- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
@@ -1,5 +1,10 @@
+# Order matters here: the first symbol locator prevents further searching.
+# For DWARF binaries that are both stripped and split, the Default plugin
+# will return the stripped binary when asked for the ObjectFile, which then
+# prevents an unstripped binary from being requested from the Debuginfod
+# provider.
+add_subdirectory(Debuginfod)
add_subdirectory(Default)
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
add_subdirectory(DebugSymbols)
endif()
-add_subdirectory(Debuginfod)
diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp
index b5fe35d71032..91b8b4a979e0 100644
--- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp
+++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp
@@ -44,6 +44,25 @@ llvm::StringRef SymbolVendorELF::GetPluginDescriptionStatic() {
"executables.";
}
+// If this is needed elsewhere, it can be exported/moved.
+static bool IsDwpSymbolFile(const lldb::ModuleSP &module_sp,
+ const FileSpec &file_spec) {
+ DataBufferSP dwp_file_data_sp;
+ lldb::offset_t dwp_file_data_offset = 0;
+ // Try to create an ObjectFile from the file_spec.
+ ObjectFileSP dwp_obj_file = ObjectFile::FindPlugin(
+ module_sp, &file_spec, 0, FileSystem::Instance().GetByteSize(file_spec),
+ dwp_file_data_sp, dwp_file_data_offset);
+ if (!ObjectFileELF::classof(dwp_obj_file.get()))
+ return false;
+ // The presence of a debug_cu_index section is the key identifying feature of
+ // a DWP file. Make sure we don't fill in the section list on dwp_obj_file
+ // (by calling GetSectionList(false)) as this is invoked before we may have
+ // all the symbol files collected and available.
+ return dwp_obj_file && dwp_obj_file->GetSectionList(false)->FindSectionByType(
+ eSectionTypeDWARFDebugCuIndex, false);
+}
+
// CreateInstance
//
// Platforms can register a callback to use when creating symbol vendors to
@@ -87,8 +106,15 @@ SymbolVendorELF::CreateInstance(const lldb::ModuleSP &module_sp,
FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
FileSpec dsym_fspec =
PluginManager::LocateExecutableSymbolFile(module_spec, search_paths);
- if (!dsym_fspec)
- return nullptr;
+ if (!dsym_fspec || IsDwpSymbolFile(module_sp, dsym_fspec)) {
+ // If we have a stripped binary or if we got a DWP file, we should prefer
+ // symbols in the executable acquired through a plugin.
+ ModuleSpec unstripped_spec =
+ PluginManager::LocateExecutableObjectFile(module_spec);
+ if (!unstripped_spec)
+ return nullptr;
+ dsym_fspec = unstripped_spec.GetFileSpec();
+ }
DataBufferSP dsym_file_data_sp;
lldb::offset_t dsym_file_data_offset = 0;
diff --git a/lldb/source/Symbol/CompileUnit.cpp b/lldb/source/Symbol/CompileUnit.cpp
index 1b3cd23d9400..ddeacf18e855 100644
--- a/lldb/source/Symbol/CompileUnit.cpp
+++ b/lldb/source/Symbol/CompileUnit.cpp
@@ -320,7 +320,7 @@ void CompileUnit::ResolveSymbolContext(
src_location_spec.GetColumn() ? std::optional<uint16_t>(line_entry.column)
: std::nullopt;
- SourceLocationSpec found_entry(line_entry.file, line_entry.line, column,
+ SourceLocationSpec found_entry(line_entry.GetFile(), line_entry.line, column,
inlines, exact);
while (line_idx != UINT32_MAX) {
diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp
index fdc090355771..194f89bc51d8 100644
--- a/lldb/source/Symbol/Function.cpp
+++ b/lldb/source/Symbol/Function.cpp
@@ -289,7 +289,7 @@ void Function::GetStartLineSourceInfo(FileSpec &source_file,
if (line_table->FindLineEntryByAddress(GetAddressRange().GetBaseAddress(),
line_entry, nullptr)) {
line_no = line_entry.line;
- source_file = line_entry.file;
+ source_file = line_entry.GetFile();
}
}
}
@@ -311,7 +311,7 @@ void Function::GetEndLineSourceInfo(FileSpec &source_file, uint32_t &line_no) {
LineEntry line_entry;
if (line_table->FindLineEntryByAddress(scratch_addr, line_entry, nullptr)) {
line_no = line_entry.line;
- source_file = line_entry.file;
+ source_file = line_entry.GetFile();
}
}
diff --git a/lldb/source/Symbol/LineEntry.cpp b/lldb/source/Symbol/LineEntry.cpp
index 389f8dcb65d8..461399e0326e 100644
--- a/lldb/source/Symbol/LineEntry.cpp
+++ b/lldb/source/Symbol/LineEntry.cpp
@@ -14,12 +14,14 @@
using namespace lldb_private;
LineEntry::LineEntry()
- : range(), file(), is_start_of_statement(0), is_start_of_basic_block(0),
- is_prologue_end(0), is_epilogue_begin(0), is_terminal_entry(0) {}
+ : range(), file_sp(std::make_shared<SupportFile>()),
+ original_file_sp(std::make_shared<SupportFile>()),
+ is_start_of_statement(0), is_start_of_basic_block(0), is_prologue_end(0),
+ is_epilogue_begin(0), is_terminal_entry(0) {}
void LineEntry::Clear() {
range.Clear();
- file.Clear();
+ file_sp = std::make_shared<SupportFile>();
original_file_sp = std::make_shared<SupportFile>();
line = LLDB_INVALID_LINE_NUMBER;
column = 0;
@@ -35,6 +37,7 @@ bool LineEntry::IsValid() const {
}
bool LineEntry::DumpStopContext(Stream *s, bool show_fullpaths) const {
+ const FileSpec &file = file_sp->GetSpecOnly();
if (file) {
if (show_fullpaths)
file.Dump(s->AsRawOstream());
@@ -67,7 +70,7 @@ bool LineEntry::Dump(Stream *s, Target *target, bool show_file,
return false;
}
if (show_file)
- *s << ", file = " << file;
+ *s << ", file = " << GetFile();
if (line)
s->Printf(", line = %u", line);
if (column)
@@ -103,7 +106,7 @@ bool LineEntry::GetDescription(Stream *s, lldb::DescriptionLevel level,
Address::DumpStyleFileAddress);
}
- *s << ": " << file;
+ *s << ": " << GetFile();
if (line) {
s->Printf(":%u", line);
@@ -173,7 +176,7 @@ int LineEntry::Compare(const LineEntry &a, const LineEntry &b) {
if (a.column > b.column)
return +1;
- return FileSpec::Compare(a.file, b.file, true);
+ return FileSpec::Compare(a.GetFile(), b.GetFile(), true);
}
AddressRange LineEntry::GetSameLineContiguousAddressRange(
@@ -242,6 +245,6 @@ void LineEntry::ApplyFileMappings(lldb::TargetSP target_sp) {
// Apply any file remappings to our file.
if (auto new_file_spec = target_sp->GetSourcePathMap().FindFile(
original_file_sp->GetSpecOnly()))
- file = *new_file_spec;
+ file_sp->Update(*new_file_spec);
}
}
diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp
index 444135f63bc0..06cf4f698316 100644
--- a/lldb/source/Symbol/LineTable.cpp
+++ b/lldb/source/Symbol/LineTable.cpp
@@ -288,8 +288,8 @@ bool LineTable::ConvertEntryAtIndexToLineEntry(uint32_t idx,
else
line_entry.range.SetByteSize(0);
- line_entry.file =
- m_comp_unit->GetSupportFiles().GetFileSpecAtIndex(entry.file_idx);
+ line_entry.file_sp = std::make_shared<SupportFile>(
+ m_comp_unit->GetSupportFiles().GetFileSpecAtIndex(entry.file_idx));
line_entry.original_file_sp =
m_comp_unit->GetSupportFiles().GetSupportFileAtIndex(entry.file_idx);
line_entry.line = entry.line;
diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp
index 3c70b8d8743c..f368896fbad4 100644
--- a/lldb/source/Symbol/SymbolContext.cpp
+++ b/lldb/source/Symbol/SymbolContext.cpp
@@ -472,8 +472,8 @@ bool SymbolContext::GetParentOfInlinedScope(const Address &curr_frame_pc,
curr_inlined_block->GetInlinedFunctionInfo();
next_frame_pc = range.GetBaseAddress();
next_frame_sc.line_entry.range.GetBaseAddress() = next_frame_pc;
- next_frame_sc.line_entry.file =
- curr_inlined_block_inlined_info->GetCallSite().GetFile();
+ next_frame_sc.line_entry.file_sp = std::make_shared<SupportFile>(
+ curr_inlined_block_inlined_info->GetCallSite().GetFile());
next_frame_sc.line_entry.original_file_sp =
std::make_shared<SupportFile>(
curr_inlined_block_inlined_info->GetCallSite().GetFile());
diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp
index c29a71d92572..3af62f52d575 100644
--- a/lldb/source/Target/StackFrame.cpp
+++ b/lldb/source/Target/StackFrame.cpp
@@ -1922,7 +1922,7 @@ bool StackFrame::GetStatus(Stream &strm, bool show_frame_info, bool show_source,
size_t num_lines =
target->GetSourceManager().DisplaySourceLinesWithLineNumbers(
- m_sc.line_entry.file, start_line, m_sc.line_entry.column,
+ m_sc.line_entry.GetFile(), start_line, m_sc.line_entry.column,
source_lines_before, source_lines_after, "->", &strm);
if (num_lines != 0)
have_source = true;
diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp
index 2273e52e2e04..314b5e39c716 100644
--- a/lldb/source/Target/StackFrameList.cpp
+++ b/lldb/source/Target/StackFrameList.cpp
@@ -884,9 +884,9 @@ void StackFrameList::SetDefaultFileAndLineToSelectedFrame() {
GetFrameAtIndex(GetSelectedFrameIndex(DoNoSelectMostRelevantFrame)));
if (frame_sp) {
SymbolContext sc = frame_sp->GetSymbolContext(eSymbolContextLineEntry);
- if (sc.line_entry.file)
+ if (sc.line_entry.GetFile())
m_thread.CalculateTarget()->GetSourceManager().SetDefaultFileAndLine(
- sc.line_entry.file, sc.line_entry.line);
+ sc.line_entry.GetFile(), sc.line_entry.line);
}
}
}
diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp
index 4dfad23b56e2..412e44ede9c1 100644
--- a/lldb/source/Target/Thread.cpp
+++ b/lldb/source/Target/Thread.cpp
@@ -302,10 +302,10 @@ bool Thread::SetSelectedFrameByIndexNoisily(uint32_t frame_idx,
SymbolContext frame_sc(
frame_sp->GetSymbolContext(eSymbolContextLineEntry));
const Debugger &debugger = GetProcess()->GetTarget().GetDebugger();
- if (debugger.GetUseExternalEditor() && frame_sc.line_entry.file &&
+ if (debugger.GetUseExternalEditor() && frame_sc.line_entry.GetFile() &&
frame_sc.line_entry.line != 0) {
if (llvm::Error e = Host::OpenFileInExternalEditor(
- debugger.GetExternalEditor(), frame_sc.line_entry.file,
+ debugger.GetExternalEditor(), frame_sc.line_entry.GetFile(),
frame_sc.line_entry.line)) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Host), std::move(e),
"OpenFileInExternalEditor failed: {0}");
@@ -1753,10 +1753,10 @@ size_t Thread::GetStatus(Stream &strm, uint32_t start_frame,
if (frame_sp) {
SymbolContext frame_sc(
frame_sp->GetSymbolContext(eSymbolContextLineEntry));
- if (frame_sc.line_entry.line != 0 && frame_sc.line_entry.file) {
+ if (frame_sc.line_entry.line != 0 && frame_sc.line_entry.GetFile()) {
if (llvm::Error e = Host::OpenFileInExternalEditor(
target->GetDebugger().GetExternalEditor(),
- frame_sc.line_entry.file, frame_sc.line_entry.line)) {
+ frame_sc.line_entry.GetFile(), frame_sc.line_entry.line)) {
LLDB_LOG_ERROR(GetLog(LLDBLog::Host), std::move(e),
"OpenFileInExternalEditor failed: {0}");
}
diff --git a/lldb/source/Target/TraceDumper.cpp b/lldb/source/Target/TraceDumper.cpp
index e92419e70b32..4ef8efc1a676 100644
--- a/lldb/source/Target/TraceDumper.cpp
+++ b/lldb/source/Target/TraceDumper.cpp
@@ -57,7 +57,7 @@ static bool FileLineAndColumnMatches(const LineEntry &a, const LineEntry &b) {
return false;
if (a.column != b.column)
return false;
- return a.file == b.file;
+ return a.GetFile() == b.GetFile();
}
/// Compare the symbol contexts of the provided \a SymbolInfo
@@ -396,7 +396,7 @@ public:
m_j.attribute(
"source",
ToOptionalString(
- item.symbol_info->sc.line_entry.file.GetPath().c_str()));
+ item.symbol_info->sc.line_entry.GetFile().GetPath().c_str()));
m_j.attribute("line", item.symbol_info->sc.line_entry.line);
m_j.attribute("column", item.symbol_info->sc.line_entry.column);
}
diff --git a/lldb/test/API/debuginfod/Normal/Makefile b/lldb/test/API/debuginfod/Normal/Makefile
new file mode 100644
index 000000000000..bd2fa623df47
--- /dev/null
+++ b/lldb/test/API/debuginfod/Normal/Makefile
@@ -0,0 +1,25 @@
+C_SOURCES := main.c
+
+# For normal (non DWP) Debuginfod tests, we need:
+
+# * The "full" binary: a.out.debug
+# Produced by Makefile.rules with KEEP_FULL_DEBUG_BINARY set to YES and
+# SPLIT_DEBUG_SYMBOLS set to YES
+
+# * The stripped binary (a.out)
+# Produced by Makefile.rules with SPLIT_DEBUG_SYMBOLS set to YES
+
+# * The 'only-keep-debug' binary (a.out.dbg)
+# Produced below
+
+# * The .uuid file (for a little easier testing code)
+# Produced below
+
+# Don't strip the debug info from a.out:
+SPLIT_DEBUG_SYMBOLS := YES
+SAVE_FULL_DEBUG_BINARY := YES
+GEN_GNU_BUILD_ID := YES
+
+all: a.out.uuid a.out
+
+include Makefile.rules
diff --git a/lldb/test/API/debuginfod/Normal/TestDebuginfod.py b/lldb/test/API/debuginfod/Normal/TestDebuginfod.py
new file mode 100644
index 000000000000..eb5efe83c17a
--- /dev/null
+++ b/lldb/test/API/debuginfod/Normal/TestDebuginfod.py
@@ -0,0 +1,185 @@
+import os
+import shutil
+import tempfile
+import struct
+
+import lldb
+from lldbsuite.test.decorators import *
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+
+
+def getUUID(aoutuuid):
+ """
+ Pull the 20 byte UUID out of the .note.gnu.build-id section that was dumped
+ to a file already, as part of the build.
+ """
+ with open(aoutuuid, "rb") as f:
+ data = f.read(36)
+ if len(data) != 36:
+ return None
+ header = struct.unpack_from("<4I", data)
+ if len(header) != 4:
+ return None
+ # 4 element 'prefix', 20 bytes of uuid, 3 byte long string: 'GNU':
+ if header[0] != 4 or header[1] != 20 or header[2] != 3 or header[3] != 0x554E47:
+ return None
+ return data[16:].hex()
+
+
+"""
+Test support for the DebugInfoD network symbol acquisition protocol.
+This one is for simple / no split-dwarf scenarios.
+
+For no-split-dwarf scenarios, there are 2 variations:
+1 - A stripped binary with its corresponding unstripped binary:
+2 - A stripped binary with a corresponding --only-keep-debug symbols file
+"""
+
+
+@skipUnlessPlatform(["linux", "freebsd"])
+class DebugInfodTests(TestBase):
+ # No need to try every flavor of debug inf.
+ NO_DEBUG_INFO_TESTCASE = True
+
+ def test_normal_no_symbols(self):
+ """
+ Validate behavior with no symbols or symbol locator.
+ ('baseline negative' behavior)
+ """
+ test_root = self.config_test(["a.out"])
+ self.try_breakpoint(False)
+
+ def test_normal_default(self):
+ """
+ Validate behavior with symbols, but no symbol locator.
+ ('baseline positive' behavior)
+ """
+ test_root = self.config_test(["a.out", "a.out.debug"])
+ self.try_breakpoint(True)
+
+ def test_debuginfod_symbols(self):
+ """
+ Test behavior with the full binary available from Debuginfod as
+ 'debuginfo' from the plug-in.
+ """
+ test_root = self.config_test(["a.out"], "a.out.full")
+ self.try_breakpoint(True)
+
+ def test_debuginfod_executable(self):
+ """
+ Test behavior with the full binary available from Debuginfod as
+ 'executable' from the plug-in.
+ """
+ test_root = self.config_test(["a.out"], None, "a.out.full")
+ self.try_breakpoint(True)
+
+ def test_debuginfod_okd_symbols(self):
+ """
+ Test behavior with the 'only-keep-debug' symbols available from Debuginfod.
+ """
+ test_root = self.config_test(["a.out"], "a.out.debug")
+ self.try_breakpoint(True)
+
+ def try_breakpoint(self, should_have_loc):
+ """
+ This function creates a target from self.aout, sets a function-name
+ breakpoint, and checks to see if we have a file/line location,
+ as a way to validate that the symbols have been loaded.
+ should_have_loc specifies if we're testing that symbols have or
+ haven't been loaded.
+ """
+ target = self.dbg.CreateTarget(self.aout)
+ self.assertTrue(target and target.IsValid(), "Target is valid")
+
+ bp = target.BreakpointCreateByName("func")
+ self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid")
+ self.assertEqual(bp.GetNumLocations(), 1)
+
+ loc = bp.GetLocationAtIndex(0)
+ self.assertTrue(loc and loc.IsValid(), "Location is valid")
+ addr = loc.GetAddress()
+ self.assertTrue(addr and addr.IsValid(), "Loc address is valid")
+ line_entry = addr.GetLineEntry()
+ self.assertEqual(
+ should_have_loc,
+ line_entry != None and line_entry.IsValid(),
+ "Loc line entry is valid",
+ )
+ if should_have_loc:
+ self.assertEqual(line_entry.GetLine(), 4)
+ self.assertEqual(
+ line_entry.GetFileSpec().GetFilename(),
+ self.main_source_file.GetFilename(),
+ )
+ self.dbg.DeleteTarget(target)
+ shutil.rmtree(self.tmp_dir)
+
+ def config_test(self, local_files, debuginfo=None, executable=None):
+ """
+ Set up a test with local_files[] copied to a different location
+ so that we control which files are, or are not, found in the file system.
+ Also, create a stand-alone file-system 'hosted' debuginfod server with the
+ provided debuginfo and executable files (if they exist)
+
+ Make the filesystem look like:
+
+ /tmp/<tmpdir>/test/[local_files]
+
+ /tmp/<tmpdir>/cache (for lldb to use as a temp cache)
+
+ /tmp/<tmpdir>/buildid/<uuid>/executable -> <executable>
+ /tmp/<tmpdir>/buildid/<uuid>/debuginfo -> <debuginfo>
+ Returns the /tmp/<tmpdir> path
+ """
+
+ self.build()
+
+ uuid = getUUID(self.getBuildArtifact("a.out.uuid"))
+
+ self.main_source_file = lldb.SBFileSpec("main.c")
+ self.tmp_dir = tempfile.mkdtemp()
+ test_dir = os.path.join(self.tmp_dir, "test")
+ os.makedirs(test_dir)
+
+ self.aout = ""
+ # Copy the files used by the test:
+ for f in local_files:
+ shutil.copy(self.getBuildArtifact(f), test_dir)
+ # The first item is the binary to be used for the test
+ if self.aout == "":
+ self.aout = os.path.join(test_dir, f)
+
+ use_debuginfod = debuginfo != None or executable != None
+
+        # Populate the 'file://... mocked' Debuginfod server:
+ if use_debuginfod:
+ os.makedirs(os.path.join(self.tmp_dir, "cache"))
+ uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid)
+ os.makedirs(uuid_dir)
+ if debuginfo:
+ shutil.copy(
+ self.getBuildArtifact(debuginfo),
+ os.path.join(uuid_dir, "debuginfo"),
+ )
+ if executable:
+ shutil.copy(
+ self.getBuildArtifact(executable),
+ os.path.join(uuid_dir, "executable"),
+ )
+
+ # Configure LLDB for the test:
+ self.runCmd(
+ "settings set symbols.enable-external-lookup %s"
+ % str(use_debuginfod).lower()
+ )
+ self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls")
+ if use_debuginfod:
+ self.runCmd(
+ "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache"
+ % self.tmp_dir
+ )
+ self.runCmd(
+ "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s"
+ % self.tmp_dir
+ )
diff --git a/lldb/test/API/debuginfod/Normal/main.c b/lldb/test/API/debuginfod/Normal/main.c
new file mode 100644
index 000000000000..4c7184609b45
--- /dev/null
+++ b/lldb/test/API/debuginfod/Normal/main.c
@@ -0,0 +1,7 @@
+// This is a dumb little pair of test files
+
+int func(int argc, const char *argv[]) {
+ return (argc + 1) * (argv[argc][0] + 2);
+}
+
+int main(int argc, const char *argv[]) { return func(0, argv); }
diff --git a/lldb/test/API/debuginfod/SplitDWARF/Makefile b/lldb/test/API/debuginfod/SplitDWARF/Makefile
new file mode 100644
index 000000000000..266d74cf9062
--- /dev/null
+++ b/lldb/test/API/debuginfod/SplitDWARF/Makefile
@@ -0,0 +1,28 @@
+C_SOURCES := main.c
+
+# For split-dwarf Debuginfod tests, we need:
+
+# * A .DWP file (a.out.dwp)
+#  Produced by Makefile.rules with MAKE_DWP set to YES
+
+# * The "full" binary: it's missing things that live in .dwo's (a.out.debug)
+# Produced by Makefile.rules with KEEP_FULL_DEBUG_BINARY set to YES and
+# SPLIT_DEBUG_SYMBOLS set to YES
+
+# * The stripped binary (a.out)
+# Produced by Makefile.rules
+
+# * The 'only-keep-debug' binary (a.out.dbg)
+# Produced below
+
+# * The .uuid file (for a little easier testing code)
+# Produced here in the rule below
+
+MAKE_DWP := YES
+SPLIT_DEBUG_SYMBOLS := YES
+SAVE_FULL_DEBUG_BINARY := YES
+GEN_GNU_BUILD_ID := YES
+
+all: a.out.uuid a.out
+
+include Makefile.rules
diff --git a/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py b/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py
new file mode 100644
index 000000000000..09f91b6f1c6c
--- /dev/null
+++ b/lldb/test/API/debuginfod/SplitDWARF/TestDebuginfodDWP.py
@@ -0,0 +1,194 @@
+"""
+Test support for the DebugInfoD network symbol acquisition protocol.
+"""
+import os
+import shutil
+import tempfile
+import struct
+
+import lldb
+from lldbsuite.test.decorators import *
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+
+
+def getUUID(aoutuuid):
+ """
+ Pull the 20 byte UUID out of the .note.gnu.build-id section that was dumped
+ to a file already, as part of the build.
+ """
+ with open(aoutuuid, "rb") as f:
+ data = f.read(36)
+ if len(data) != 36:
+ return None
+ header = struct.unpack_from("<4I", data)
+ if len(header) != 4:
+ return None
+ # 4 element 'prefix', 20 bytes of uuid, 3 byte long string: 'GNU':
+ if header[0] != 4 or header[1] != 20 or header[2] != 3 or header[3] != 0x554E47:
+ return None
+ return data[16:].hex()
+
+
+"""
+Test support for the DebugInfoD network symbol acquisition protocol.
+This file is for split-dwarf (dwp) scenarios.
+
+1 - A split binary target with its corresponding DWP file
+2 - A stripped, split binary target with an unstripped binary and a DWP file
+3 - A stripped, split binary target with an --only-keep-debug symbols file and a DWP file
+"""
+
+
+@skipUnlessPlatform(["linux", "freebsd"])
+class DebugInfodDWPTests(TestBase):
+ # No need to try every flavor of debug inf.
+ NO_DEBUG_INFO_TESTCASE = True
+
+ def test_normal_stripped(self):
+ """
+ Validate behavior with a stripped binary, no symbols or symbol locator.
+ """
+ self.config_test(["a.out"])
+ self.try_breakpoint(False)
+
+ def test_normal_stripped_split_with_dwp(self):
+ """
+ Validate behavior with symbols, but no symbol locator.
+ """
+ self.config_test(["a.out", "a.out.debug", "a.out.dwp"])
+ self.try_breakpoint(True)
+
+ def test_normal_stripped_only_dwp(self):
+ """
+ Validate behavior *with* dwp symbols only, but missing other symbols,
+ but no symbol locator. This shouldn't work: without the other symbols
+ DWO's appear mostly useless.
+ """
+ self.config_test(["a.out", "a.out.dwp"])
+ self.try_breakpoint(False)
+
+ def test_debuginfod_dwp_from_service(self):
+ """
+ Test behavior with the unstripped binary, and DWP from the service.
+ """
+ self.config_test(["a.out.debug"], "a.out.dwp")
+ self.try_breakpoint(True)
+
+ def test_debuginfod_both_symfiles_from_service(self):
+ """
+ Test behavior with a stripped binary, with the unstripped binary and
+ dwp symbols from Debuginfod.
+ """
+ self.config_test(["a.out"], "a.out.dwp", "a.out.full")
+ self.try_breakpoint(True)
+
+ def test_debuginfod_both_okd_symfiles_from_service(self):
+ """
+ Test behavior with both the only-keep-debug symbols and the dwp symbols
+ from Debuginfod.
+ """
+ self.config_test(["a.out"], "a.out.dwp", "a.out.debug")
+ self.try_breakpoint(True)
+
+ def try_breakpoint(self, should_have_loc):
+ """
+ This function creates a target from self.aout, sets a function-name
+ breakpoint, and checks to see if we have a file/line location,
+ as a way to validate that the symbols have been loaded.
+ should_have_loc specifies if we're testing that symbols have or
+ haven't been loaded.
+ """
+ target = self.dbg.CreateTarget(self.aout)
+ self.assertTrue(target and target.IsValid(), "Target is valid")
+
+ bp = target.BreakpointCreateByName("func")
+ self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid")
+ self.assertEqual(bp.GetNumLocations(), 1)
+
+ loc = bp.GetLocationAtIndex(0)
+ self.assertTrue(loc and loc.IsValid(), "Location is valid")
+ addr = loc.GetAddress()
+ self.assertTrue(addr and addr.IsValid(), "Loc address is valid")
+ line_entry = addr.GetLineEntry()
+ self.assertEqual(
+ should_have_loc,
+ line_entry != None and line_entry.IsValid(),
+ "Loc line entry is valid",
+ )
+ if should_have_loc:
+ self.assertEqual(line_entry.GetLine(), 4)
+ self.assertEqual(
+ line_entry.GetFileSpec().GetFilename(),
+ self.main_source_file.GetFilename(),
+ )
+ self.dbg.DeleteTarget(target)
+ shutil.rmtree(self.tmp_dir)
+
+ def config_test(self, local_files, debuginfo=None, executable=None):
+ """
+ Set up a test with local_files[] copied to a different location
+ so that we control which files are, or are not, found in the file system.
+ Also, create a stand-alone file-system 'hosted' debuginfod server with the
+ provided debuginfo and executable files (if they exist)
+
+ Make the filesystem look like:
+
+ /tmp/<tmpdir>/test/[local_files]
+
+ /tmp/<tmpdir>/cache (for lldb to use as a temp cache)
+
+ /tmp/<tmpdir>/buildid/<uuid>/executable -> <executable>
+ /tmp/<tmpdir>/buildid/<uuid>/debuginfo -> <debuginfo>
+ Returns the /tmp/<tmpdir> path
+ """
+
+ self.build()
+
+ uuid = getUUID(self.getBuildArtifact("a.out.uuid"))
+
+ self.main_source_file = lldb.SBFileSpec("main.c")
+ self.tmp_dir = tempfile.mkdtemp()
+ self.test_dir = os.path.join(self.tmp_dir, "test")
+ os.makedirs(self.test_dir)
+
+ self.aout = ""
+ # Copy the files used by the test:
+ for f in local_files:
+ shutil.copy(self.getBuildArtifact(f), self.test_dir)
+ if self.aout == "":
+ self.aout = os.path.join(self.test_dir, f)
+
+ use_debuginfod = debuginfo != None or executable != None
+
+        # Populate the 'file://... mocked' Debuginfod server:
+ if use_debuginfod:
+ os.makedirs(os.path.join(self.tmp_dir, "cache"))
+ uuid_dir = os.path.join(self.tmp_dir, "buildid", uuid)
+ os.makedirs(uuid_dir)
+ if debuginfo:
+ shutil.copy(
+ self.getBuildArtifact(debuginfo),
+ os.path.join(uuid_dir, "debuginfo"),
+ )
+ if executable:
+ shutil.copy(
+ self.getBuildArtifact(executable),
+ os.path.join(uuid_dir, "executable"),
+ )
+ os.remove(self.getBuildArtifact("main.dwo"))
+ # Configure LLDB for the test:
+ self.runCmd(
+ "settings set symbols.enable-external-lookup %s"
+ % str(use_debuginfod).lower()
+ )
+ self.runCmd("settings clear plugin.symbol-locator.debuginfod.server-urls")
+ if use_debuginfod:
+ self.runCmd(
+ "settings set plugin.symbol-locator.debuginfod.cache-path %s/cache"
+ % self.tmp_dir
+ )
+ self.runCmd(
+ "settings insert-before plugin.symbol-locator.debuginfod.server-urls 0 file://%s"
+ % self.tmp_dir
+ )
diff --git a/lldb/test/API/debuginfod/SplitDWARF/main.c b/lldb/test/API/debuginfod/SplitDWARF/main.c
new file mode 100644
index 000000000000..4c7184609b45
--- /dev/null
+++ b/lldb/test/API/debuginfod/SplitDWARF/main.c
@@ -0,0 +1,7 @@
+// This is a dumb little pair of test files
+
+int func(int argc, const char *argv[]) {
+ return (argc + 1) * (argv[argc][0] + 2);
+}
+
+int main(int argc, const char *argv[]) { return func(0, argv); }
diff --git a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
index f237dd63ab1c..4379ffac9d74 100644
--- a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
+++ b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp
@@ -102,7 +102,7 @@ protected:
EXPECT_EQ(line, entry.line);
EXPECT_EQ(address, entry.range.GetBaseAddress());
- EXPECT_TRUE(FileSpecMatchesAsBaseOrFull(spec, entry.file));
+ EXPECT_TRUE(FileSpecMatchesAsBaseOrFull(spec, entry.GetFile()));
}
bool ContainsCompileUnit(const SymbolContextList &sc_list,
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 29ea5005c0c4..6e6d6b157148 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1454,6 +1454,11 @@ The AMDGPU backend supports the following LLVM IR attributes.
CLANG attribute [CLANG-ATTR]_. Clang only emits this attribute when all
the three numbers are >= 1.
+ "amdgpu-no-agpr" Indicates the function will not require allocating AGPRs. This is only
+ relevant on subtargets with AGPRs. The behavior is undefined if a
+ function which requires AGPRs is reached through any function marked
+ with this attribute.
+
======================================= ==========================================================
Calling Conventions
diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst
index 9d0cb7ad1195..985d16eb11cf 100644
--- a/llvm/docs/CommandGuide/llvm-objcopy.rst
+++ b/llvm/docs/CommandGuide/llvm-objcopy.rst
@@ -464,6 +464,19 @@ them.
Read a list of symbols from <filename> and change their visibility to the
specified value. Visibility values: default, internal, hidden, protected.
+.. option:: --skip-symbol <symbol>
+
+ Do not change the parameters of symbol ``<symbol>`` when executing other
+ options that can change the symbol's name, binding or visibility.
+
+.. option:: --skip-symbols <filename>
+
+ Do not change the parameters of symbols named in the file ``<filename>`` when
+ executing other options that can change the symbol's name, binding or
+ visibility. In the file, each line represents a single symbol, with leading
+ and trailing whitespace ignored, as is anything following a '#'.
+ Can be specified multiple times to read names from multiple files.
+
.. option:: --split-dwo <dwo-file>
Equivalent to running :program:`llvm-objcopy` with :option:`--extract-dwo` and
diff --git a/llvm/docs/InstCombineContributorGuide.md b/llvm/docs/InstCombineContributorGuide.md
new file mode 100644
index 000000000000..2416fd0920f6
--- /dev/null
+++ b/llvm/docs/InstCombineContributorGuide.md
@@ -0,0 +1,556 @@
+# InstCombine contributor guide
+
+This guide lays out a series of rules that contributions to InstCombine should
follow. **Following these rules will result in much faster PR approvals.**
+
+## Tests
+
+### Precommit tests
+
+Tests for new optimizations or miscompilation fixes should be pre-committed.
+This means that you first commit the test with CHECK lines showing the behavior
+*without* your change. Your actual change will then only contain CHECK line
+diffs relative to that baseline.
+
+This means that pull requests should generally contain two commits: First,
+one commit adding new tests with baseline check lines. Second, a commit with
+functional changes and test diffs.
+
+If the second commit in your PR does not contain test diffs, you did something
+wrong. Either you made a mistake when generating CHECK lines, or your tests are
+not actually affected by your patch.
+
+Exceptions: When fixing assertion failures or infinite loops, do not pre-commit
+tests.
+
+### Use `update_test_checks.py`
+
+CHECK lines should be generated using the `update_test_checks.py` script. Do
+**not** manually edit check lines after using it.
+
+Be sure to use the correct opt binary when using the script. For example, if
+your build directory is `build`, then you'll want to run:
+
+```sh
+llvm/utils/update_test_checks.py --opt-binary build/bin/opt \
+ llvm/test/Transforms/InstCombine/the_test.ll
+```
+
+Exceptions: Hand-written CHECK lines are allowed for debuginfo tests.
+
+### General testing considerations
+
+Place all tests relating to a transform into a single file. If you are adding
+a regression test for a crash/miscompile in an existing transform, find the
+file where the existing tests are located. A good way to do that is to comment
+out the transform and see which tests fail.
+
+Make tests minimal. Only test exactly the pattern being transformed. If your
+original motivating case is a larger pattern that your fold enables to
+optimize in some non-trivial way, you may add it as well -- however, the bulk
+of the test coverage should be minimal.
+
+Give tests short, but meaningful names. Don't call them `@test1`, `@test2` etc.
+For example, a test checking multi-use behavior of a fold involving the
+addition of two selects might be called `@add_of_selects_multi_use`.
+
+Add representative tests for each test category (discussed below), but don't
+test all combinations of everything. If you have multi-use tests, and you have
+commuted tests, you shouldn't also add commuted multi-use tests.
+
+Prefer to keep bit-widths for tests low to improve performance of proof checking using alive2. Using `i8` is better than `i128` where possible.
+
+### Add negative tests
+
+Make sure to add tests for which your transform does **not** apply. Start with
+one of the test cases that succeeds and then create a sequence of negative
+tests, such that **exactly one** different pre-condition of your transform is
+not satisfied in each test.
+
+### Add multi-use tests
+
+Add multi-use tests that ensures your transform does not increase instruction
+count if some instructions have additional uses. The standard pattern is to
+introduce extra uses with function calls:
+
+```llvm
+declare void @use(i8)
+
+define i8 @add_mul_const_multi_use(i8 %x) {
+ %add = add i8 %x, 1
+ call void @use(i8 %add)
+ %mul = mul i8 %add, 3
+ ret i8 %mul
+}
+```
+
+Exceptions: For transform that only produce one instruction, multi-use tests
+may be omitted.
+
+### Add commuted tests
+
+If the transform involves commutative operations, add tests with commuted
+(swapped) operands.
+
+Make sure that the operand order stays intact in the CHECK lines of your
+pre-committed tests. You should not see something like this:
+
+```llvm
+; CHECK-NEXT: [[OR:%.*]] = or i8 [[X]], [[Y]]
+; ...
+%or = or i8 %y, %x
+```
+
+If this happens, you may need to change one of the operands to have higher
+complexity (include the "thwart" comment in that case):
+
+```llvm
+%y2 = mul i8 %y, %y ; thwart complexity-based canonicalization
+%or = or i8 %y, %x
+```
+
+### Add vector tests
+
+When possible, it is recommended to add at least one test that uses vectors
+instead of scalars.
+
+For patterns that include constants, we distinguish three kinds of tests.
+The first are "splat" vectors, where all the vector elements are the same.
+These tests *should* usually fold without additional effort.
+
+```llvm
+define <2 x i8> @add_mul_const_vec_splat(<2 x i8> %x) {
+ %add = add <2 x i8> %x, <i8 1, i8 1>
+ %mul = mul <2 x i8> %add, <i8 3, i8 3>
+ ret <2 x i8> %mul
+}
+```
+
+A minor variant is to replace some of the splat elements with poison. These
+will often also fold without additional effort.
+
+```llvm
+define <2 x i8> @add_mul_const_vec_splat_poison(<2 x i8> %x) {
+ %add = add <2 x i8> %x, <i8 1, i8 poison>
+ %mul = mul <2 x i8> %add, <i8 3, i8 poison>
+ ret <2 x i8> %mul
+}
+```
+
+Finally, you can have non-splat vectors, where the vector elements are not
+the same:
+
+```llvm
+define <2 x i8> @add_mul_const_vec_non_splat(<2 x i8> %x) {
+ %add = add <2 x i8> %x, <i8 1, i8 5>
+ %mul = mul <2 x i8> %add, <i8 3, i8 6>
+ ret <2 x i8> %mul
+}
+```
+
+Non-splat vectors will often not fold by default. You should **not** try to
+make them fold, unless doing so does not add **any** additional complexity.
+You should still add the test though, even if it does not fold.
+
+### Flag tests
+
+If your transform involves instructions that can have poison-generating flags,
+such as `nuw` and `nsw` on `add`, you should test how these interact with the
+transform.
+
+If your transform *requires* a certain flag for correctness, make sure to add
+negative tests missing the required flag.
+
+If your transform doesn't require flags for correctness, you should have tests
+for preservation behavior. If the input instructions have certain flags, are
+they preserved in the output instructions, if it is valid to preserve them?
+(This depends on the transform. Check with alive2.)
+
+The same also applies to fast-math-flags (FMF). In that case, please always
+test specific flags like `nnan`, `nsz` or `reassoc`, rather than the umbrella
+`fast` flag.
+
+### Other tests
+
+The test categories mentioned above are non-exhaustive. There may be more tests
+to be added, depending on the instructions involved in the transform. Some
+examples:
+
+ * For folds involving memory accesses like load/store, check that scalable vectors and non-byte-size types (like i3) are handled correctly. Also check that volatile/atomic are handled.
+ * For folds that interact with the bitwidth in some non-trivial way, check an illegal type like i13. Also confirm that the transform is correct for i1.
+ * For folds that involve phis, you may want to check that the case of multiple incoming values from one block is handled correctly.
+
+## Proofs
+
+Your pull request description should contain one or more
+[alive2 proofs](https://alive2.llvm.org/ce/) for the correctness of the
+proposed transform.
+
+### Basics
+
+Proofs are written using LLVM IR, by specifying a `@src` and `@tgt` function.
+It is possible to include multiple proofs in a single file by giving the src
+and tgt functions matching suffixes.
+
+For example, here is a pair of proofs that both `(x-y)+y` and `(x+y)-y` can
+be simplified to `x` ([online](https://alive2.llvm.org/ce/z/MsPPGz)):
+
+```llvm
+define i8 @src_add_sub(i8 %x, i8 %y) {
+ %add = add i8 %x, %y
+ %sub = sub i8 %add, %y
+ ret i8 %sub
+}
+
+define i8 @tgt_add_sub(i8 %x, i8 %y) {
+ ret i8 %x
+}
+
+
+define i8 @src_sub_add(i8 %x, i8 %y) {
+ %sub = sub i8 %x, %y
+ %add = add i8 %sub, %y
+ ret i8 %add
+}
+
+define i8 @tgt_sub_add(i8 %x, i8 %y) {
+ ret i8 %x
+}
+```
+
+### Use generic values in proofs
+
+Proofs should operate on generic values, rather than specific constants, to the degree that this is possible.
+
+For example, if we want to fold `X s/ C s< X` to `X s> 0`, the following would
+be a *bad* proof:
+
+```llvm
+; Don't do this!
+define i1 @src(i8 %x) {
+ %div = sdiv i8 %x, 123
+ %cmp = icmp slt i8 %div, %x
+ ret i1 %cmp
+}
+
+define i1 @tgt(i8 %x) {
+ %cmp = icmp sgt i8 %x, 0
+ ret i1 %cmp
+}
+```
+
+This is because it only proves that the transform is correct for the specific
+constant 123. Maybe there are some constants for which the transform is
+incorrect?
+
+The correct way to write this proof is as follows
+([online](https://alive2.llvm.org/ce/z/acjwb6)):
+
+```llvm
+define i1 @src(i8 %x, i8 %C) {
+ %precond = icmp ne i8 %C, 1
+ call void @llvm.assume(i1 %precond)
+ %div = sdiv i8 %x, %C
+ %cmp = icmp slt i8 %div, %x
+ ret i1 %cmp
+}
+
+define i1 @tgt(i8 %x, i8 %C) {
+ %cmp = icmp sgt i8 %x, 0
+ ret i1 %cmp
+}
+```
+
+Note that the `@llvm.assume` intrinsic is used to specify pre-conditions for
+the transform. In this case, the proof will fail unless we specify `C != 1` as
+a pre-condition.
+
+It should be emphasized that there is, in general, no expectation that the
+IR in the proofs will be transformed by the implemented fold. In the above
+example, the transform would only apply if `%C` is actually a constant, but we
+need to use non-constants in the proof.
+
+### Common pre-conditions
+
+Here are some examples of common preconditions.
+
+```llvm
+; %x is non-negative:
+%nonneg = icmp sgt i8 %x, -1
+call void @llvm.assume(i1 %nonneg)
+
+; %x is a power of two:
+%ctpop = call i8 @llvm.ctpop.i8(i8 %x)
+%pow2 = icmp eq i8 %x, 1
+call void @llvm.assume(i1 %pow2)
+
+; %x is a power of two or zero:
+%ctpop = call i8 @llvm.ctpop.i8(i8 %x)
+%pow2orzero = icmp ult i8 %x, 2
+call void @llvm.assume(i1 %pow2orzero)
+
+; Adding %x and %y does not overflow in a signed sense:
+%wo = call { i8, i1 } @llvm.sadd.with.overflow(i8 %x, i8 %y)
+%ov = extractvalue { i8, i1 } %wo, 1
+%ov.not = xor i1 %ov, true
+call void @llvm.assume(i1 %ov.not)
+```
+
+### Timeouts
+
+Alive2 proofs will sometimes produce a timeout with the following message:
+
+```
+Alive2 timed out while processing your query.
+There are a few things you can try:
+
+- remove extraneous instructions, if any
+
+- reduce variable widths, for example to i16, i8, or i4
+
+- add the --disable-undef-input command line flag, which
+ allows Alive2 to assume that arguments to your IR are not
+ undef. This is, in general, unsound: it can cause Alive2
+ to miss bugs.
+```
+
+This is good advice, follow it!
+
+Reducing the bitwidth usually helps. For floating point numbers, you can use
+the `half` type for bitwidth reduction purposes. For pointers, you can reduce
+the bitwidth by specifying a custom data layout:
+
+```llvm
+; For 16-bit pointers
+target datalayout = "p:16:16"
+```
+
+If reducing the bitwidth does not help, try `-disable-undef-input`. This will
+often significantly improve performance, but also implies that the correctness
+of the transform with `undef` values is no longer verified. This is usually
+fine if the transform does not increase the number of uses of any value.
+
+Finally, it's possible to build alive2 locally and use `-smt-to=<m>` to verify
+the proof with a larger timeout. If you don't want to do this (or it still
+does not work), please submit the proof you have despite the timeout.
+
+## Implementation
+
+### Real-world usefulness
+
+There is a very large number of transforms that *could* be implemented, but
+only a tiny fraction of them are useful for real-world code.
+
+Transforms that do not have real-world usefulness provide *negative* value to
+the LLVM project, by taking up valuable reviewer time, increasing code
+complexity and increasing compile-time overhead.
+
+We do not require explicit proof of real-world usefulness for every transform
+-- in most cases the usefulness is fairly "obvious". However, the question may
+come up for complex or unusual folds. Keep this in mind when choosing what you
+work on.
+
+In particular, fixes for fuzzer-generated missed optimization reports will
+likely be rejected if there is no evidence of real-world usefulness.
+
+### Pick the correct optimization pass
+
+There are a number of passes and utilities in the InstCombine family, and it
+is important to pick the right place when implementing a fold.
+
+ * `ConstantFolding`: For folding instructions with constant arguments to a constant. (Mainly relevant for intrinsics.)
+ * `ValueTracking`: For analyzing instructions, e.g. for known bits, non-zero, etc. Tests should usually use `-passes=instsimplify`.
+ * `InstructionSimplify`: For folds that do not create new instructions (either fold to existing value or constant).
+ * `InstCombine`: For folds that create or modify instructions.
+ * `AggressiveInstCombine`: For folds that are expensive, or violate InstCombine requirements.
+ * `VectorCombine`: For folds of vector operations that require target-dependent cost-modelling.
+
+Sometimes, folds that logically belong in InstSimplify are placed in InstCombine instead, for example because they are too expensive, or because they are structurally simpler to implement in InstCombine.
+
+For example, if a fold produces new instructions in some cases but returns an existing value in others, it may be preferable to keep all cases in InstCombine, rather than trying to split them among InstCombine and InstSimplify.
+
+### Canonicalization and target-independence
+
+InstCombine is a target-independent canonicalization pass. This means that it
+tries to bring IR into a "canonical form" that other optimizations (both inside
+and outside of InstCombine) can rely on. For this reason, the chosen canonical
+form needs to be the same for all targets, and not depend on target-specific
+cost modelling.
+
+In many cases, "canonicalization" and "optimization" coincide. For example, if
+we convert `x * 2` into `x << 1`, this both makes the IR more canonical
+(because there is now only one way to express the same operation, rather than
+two) and faster (because shifts will usually have lower latency than
+multiplies).
+
+However, there are also canonicalizations that don't serve any direct
+optimization purpose. For example, InstCombine will canonicalize non-strict
+predicates like `ule` to strict predicates like `ult`. `icmp ule i8 %x, 7`
+becomes `icmp ult i8 %x, 8`. This is not an optimization in any meaningful
+sense, but it does reduce the number of cases that other transforms need to
+handle.
+
+If some canonicalization is not profitable for a specific target, then a reverse
+transform needs to be added in the backend. Patches to disable specific
+InstCombine transforms on certain targets, or to drive them using
+target-specific cost-modelling, **will not be accepted**. The only permitted
+target-dependence is on DataLayout and TargetLibraryInfo.
+
+The use of TargetTransformInfo is only allowed for hooks for target-specific
+intrinsics, such as `TargetTransformInfo::instCombineIntrinsic()`. These are
+already inherently target-dependent anyway.
+
+For vector-specific transforms that require cost-modelling, the VectorCombine
+pass can be used instead. In very rare circumstances, if there are no other
+alternatives, target-dependent transforms may be accepted into
+AggressiveInstCombine.
+
+### PatternMatch
+
+Many transforms make use of the matching infrastructure defined in
+[PatternMatch.h](https://github.com/llvm/llvm-project/blame/main/llvm/include/llvm/IR/PatternMatch.h).
+
+Here is a typical usage example:
+
+```
+// Fold (A - B) + B and B + (A - B) to A.
+Value *A, *B;
+if (match(V, m_c_Add(m_Sub(m_Value(A), m_Value(B)), m_Deferred(B))))
+ return A;
+```
+
+And another:
+
+```
+// Fold A + C1 == C2 to A == C1+C2
+Value *A;
+if (match(V, m_ICmp(Pred, m_Add(m_Value(A), m_APInt(C1)), m_APInt(C2))) &&
+ ICmpInst::isEquality(Pred))
+ return Builder.CreateICmp(Pred, A,
+ ConstantInt::get(A->getType(), *C1 + *C2));
+```
+
+Some common matchers are:
+
+ * `m_Value(A)`: Match any value and write it into `Value *A`.
+ * `m_Specific(A)`: Check that the operand equals A. Use this if A is
+ assigned **outside** the pattern.
+ * `m_Deferred(A)`: Check that the operand equals A. Use this if A is
+ assigned **inside** the pattern, for example via `m_Value(A)`.
+ * `m_APInt(C)`: Match a scalar integer constant or splat vector constant into
+ `const APInt *C`. Does not permit undef/poison values.
+ * `m_ImmConstant(C)`: Match any non-constant-expression constant into
+ `Constant *C`.
+ * `m_Constant(C)`: Match any constant into `Constant *C`. Don't use this unless
+ you know what you're doing.
+ * `m_Add(M1, M2)`, `m_Sub(M1, M2)`, etc: Match an add/sub/etc where the first
+ operand matches M1 and the second M2.
+ * `m_c_Add(M1, M2)`, etc: Match an add commutatively. The operands must match
+ either M1 and M2 or M2 and M1. Most instruction matchers have a commutative
+ variant.
+ * `m_ICmp(Pred, M1, M2)` and `m_c_ICmp(Pred, M1, M2)`: Match an icmp, writing
+  the predicate into `ICmpInst::Predicate Pred`. If the commutative version
+ is used, and the operands match in order M2, M1, then `Pred` will be the
+ swapped predicate.
+ * `m_OneUse(M)`: Check that the value only has one use, and also matches M.
+ For example `m_OneUse(m_Add(...))`. See the next section for more
+ information.
+
+See the header for the full list of available matchers.
+
+### InstCombine APIs
+
+InstCombine transforms are handled by `visitXYZ()` methods, where XYZ
+corresponds to the root instruction of your transform. If the outermost
+instruction of the pattern you are matching is an icmp, the fold will be
+located somewhere inside `visitICmpInst()`.
+
+The return value of the visit method is an instruction. You can either return
+a new instruction, in which case it will be inserted before the old one, and
+uses of the old one will be replaced by it. Or you can return the original
+instruction to indicate that *some* kind of change has been made. Finally, a
+nullptr return value indicates that no change occurred.
+
+For example, if your transform produces a single new icmp instruction, you could
+write the following:
+
+```
+if (...)
+ return new ICmpInst(Pred, X, Y);
+```
+
+In this case the main InstCombine loop takes care of inserting the instruction
+and replacing uses of the old instruction.
+
+Alternatively, you can also write it like this:
+
+```
+if (...)
+ return replaceInstUsesWith(OrigI, Builder.CreateICmp(Pred, X, Y));
+```
+
+In this case `IRBuilder` will insert the instruction and `replaceInstUsesWith()`
+will replace the uses of the old instruction, and return it to indicate that
+a change occurred.
+
+Both forms are equivalent, and you can use whichever is more convenient in
+context. For example, it's common that folds are inside helper functions that
+return `Value *` and then `replaceInstUsesWith()` is invoked on the result of
+that helper.
+
+InstCombine makes use of a worklist, which needs to be correctly updated during
+transforms. This usually happens automatically, but there are some things to
+keep in mind:
+
+ * Don't use the `Value::replaceAllUsesWith()` API. Use InstCombine's
+ `replaceInstUsesWith()` helper instead.
+ * Don't use the `Instruction::eraseFromParent()` API. Use InstCombine's
+   `eraseInstFromFunction()` helper instead. (Explicitly erasing instructions
+   is usually not necessary, as side-effect free instructions without users
+ are automatically removed.)
+ * Apart from the "directly return an instruction" pattern above, use IRBuilder
+   to create all instructions. Do not manually create and insert them.
+ * When replacing operands or uses of instructions, use `replaceOperand()`
+ and `replaceUse()` instead of `setOperand()`.
+
+### Multi-use handling
+
+Transforms should usually not increase the total number of instructions. This
+is not a hard requirement: For example, it is usually worthwhile to replace a
+single division instruction with multiple other instructions.
+
+For example, if you have a transform that replaces two instructions with two
+other instructions, this is (usually) only profitable if *both* the original
+instructions can be removed. To ensure that both instructions are removed, you
+need to add a one-use check for the inner instruction.
+
+One-use checks can be performed using the `m_OneUse()` matcher, or the
+`V->hasOneUse()` method.
+
+### Generalization
+
+Transforms can both be too specific (only handling some odd subset of patterns,
+leading to unexpected optimization cliffs) and too general (introducing
+complexity to handle cases with no real-world relevance). The right level of
+generality is quite subjective, so this section only provides some broad
+guidelines.
+
+ * Avoid transforms that are hardcoded to specific constants. Try to figure
+ out what the general rule for arbitrary constants is.
+ * Add handling for conjugate patterns. For example, if you implement a fold
+ for `icmp eq`, you almost certainly also want to support `icmp ne`, with the
+ inverse result. Similarly, if you implement a pattern for `and` of `icmp`s,
+ you should also handle the de-Morgan conjugate using `or`.
+ * Handle non-splat vector constants if doing so is free, but do not add
+ handling for them if it adds any additional complexity to the code.
+ * Do not handle non-canonical patterns, unless there is a specific motivation
+ to do so. For example, it may sometimes be worthwhile to handle a pattern
+ that would normally be converted into a different canonical form, but can
+ still occur in multi-use scenarios. This is fine to do if there is specific
+ real-world motivation, but you should not go out of your way to do this
+ otherwise.
+ * Sometimes the motivating pattern uses a constant value with certain
+ properties, but the fold can be generalized to non-constant values by making
+ use of ValueTracking queries. Whether this makes sense depends on the case,
+ but it's usually a good idea to only handle the constant pattern first, and
+ then generalize later if it seems useful.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 03691efe836f..01ecbdba5060 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -175,6 +175,10 @@ Changes to the LLVM tools
``--set-symbols-visibility`` options for ELF input to change the
visibility of symbols.
+* llvm-objcopy now supports ``--skip-symbol`` and ``--skip-symbols`` options
+ for ELF input to skip the specified symbols when executing other options
+ that can change a symbol's name, binding or visibility.
+
Changes to LLDB
---------------------------------
diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst
index 155cb3361669..f40a04d414a2 100644
--- a/llvm/docs/UserGuides.rst
+++ b/llvm/docs/UserGuides.rst
@@ -43,6 +43,7 @@ intermediate LLVM representation.
HowToCrossCompileBuiltinsOnArm
HowToCrossCompileLLVM
HowToUpdateDebugInfo
+ InstCombineContributorGuide
InstrProfileFormat
InstrRefDebugInfo
LinkTimeOptimization
@@ -186,6 +187,10 @@ Optimizations
:doc:`InstrProfileFormat`
This document explains two binary formats of instrumentation-based profiles.
+:doc:`InstCombineContributorGuide`
+   This document specifies guidelines for contributing to InstCombine and
+   related passes.
+
Code Generation
---------------
diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h
index 72461d0d9c31..4c31cd847bdf 100644
--- a/llvm/include/llvm/BinaryFormat/COFF.h
+++ b/llvm/include/llvm/BinaryFormat/COFF.h
@@ -806,6 +806,12 @@ enum Feat00Flags : uint32_t {
Kernel = 0x40000000,
};
+enum class Arm64ECThunkType : uint8_t {
+ GuestExit = 0,
+ Entry = 1,
+ Exit = 4,
+};
+
inline bool isReservedSectionNumber(int32_t SectionNumber) {
return SectionNumber <= 0;
}
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 5bb3692f0a46..284f434fbb9b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -429,6 +429,7 @@ public:
LegalizeResult lowerDIVREM(MachineInstr &MI);
LegalizeResult lowerAbsToAddXor(MachineInstr &MI);
LegalizeResult lowerAbsToMaxNeg(MachineInstr &MI);
+ LegalizeResult lowerAbsToCNeg(MachineInstr &MI);
LegalizeResult lowerVectorReduction(MachineInstr &MI);
LegalizeResult lowerMemcpyInline(MachineInstr &MI);
LegalizeResult lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen = 0);
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
index d368c7e0ece8..f1d4fc72d5a7 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
@@ -605,6 +605,9 @@ public:
NameIndex(const DWARFDebugNames &Section, uint64_t Base)
: Section(Section), Base(Base) {}
+ /// Returns Hdr field
+ Header getHeader() const { return Hdr; }
+
/// Reads offset of compilation unit CU. CU is 0-based.
uint64_t getCUOffset(uint32_t CU) const;
uint32_t getCUCount() const { return Hdr.CompUnitCount; }
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819..fff03dee20a1 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
// The content of the other 16-bit half is preserved from the input.
+
+defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = {
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
@@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
// GFX12: The op_sel bit must be 0.
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+}
//===----------------------------------------------------------------------===//
// GFX12 Intrinsics
@@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
[IntrNoMem, IntrConvergent, IntrWillReturn,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
-
-// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
-//
-// These operations perform a matrix multiplication and accumulation of
-// the form: D = A * B + C .
-
-// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
-def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
-// A and B are <16 x iu4>.
-def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
-
// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
//
// These operations perform a sparse matrix multiplication and accumulation of
@@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
>;
+defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = {
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C .
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
@@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+}
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
@@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
+defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
@@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
+}
//===----------------------------------------------------------------------===//
// gfx90a intrinsics
@@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
+defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
@@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
+}
//===----------------------------------------------------------------------===//
// gfx940 intrinsics
// ===----------------------------------------------------------------------===//
-// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
-def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
-def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
- [llvm_v2i16_ty],
- [LLVMQualPointerType<3>, llvm_v2i16_ty],
- [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
- ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
-
-def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
-def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
-def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
-def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
-
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;
@@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}
-defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
-defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
-
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
[IntrConvergent, IntrNoMem,
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
-def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
-def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
-def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
-def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
-
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
@@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}
+// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
+def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
+def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
+ [llvm_v2i16_ty],
+ [LLVMQualPointerType<3>, llvm_v2i16_ty],
+ [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
+ ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
+
+defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
+def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
+def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
+def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
+
+defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
+defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
+
+def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
+def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
+def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
+
defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
+}
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 382009d9df78..04feb1fd17fd 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -1319,6 +1319,26 @@ m_AddLike(const LHS &L, const RHS &R) {
return m_CombineOr(m_Add(L, R), m_DisjointOr(L, R));
}
+/// Match either "add nsw" or "or disjoint"
+template <typename LHS, typename RHS>
+inline match_combine_or<
+ OverflowingBinaryOp_match<LHS, RHS, Instruction::Add,
+ OverflowingBinaryOperator::NoSignedWrap>,
+ DisjointOr_match<LHS, RHS>>
+m_NSWAddLike(const LHS &L, const RHS &R) {
+ return m_CombineOr(m_NSWAdd(L, R), m_DisjointOr(L, R));
+}
+
+/// Match either "add nuw" or "or disjoint"
+template <typename LHS, typename RHS>
+inline match_combine_or<
+ OverflowingBinaryOp_match<LHS, RHS, Instruction::Add,
+ OverflowingBinaryOperator::NoUnsignedWrap>,
+ DisjointOr_match<LHS, RHS>>
+m_NUWAddLike(const LHS &L, const RHS &R) {
+ return m_CombineOr(m_NUWAdd(L, R), m_DisjointOr(L, R));
+}
+
//===----------------------------------------------------------------------===//
// Class that matches a group of binary opcodes.
//
diff --git a/llvm/include/llvm/ObjCopy/CommonConfig.h b/llvm/include/llvm/ObjCopy/CommonConfig.h
index 8f69c9fbeaf5..9d6d5fb23b18 100644
--- a/llvm/include/llvm/ObjCopy/CommonConfig.h
+++ b/llvm/include/llvm/ObjCopy/CommonConfig.h
@@ -233,6 +233,7 @@ struct CommonConfig {
NameMatcher UnneededSymbolsToRemove;
NameMatcher SymbolsToWeaken;
NameMatcher SymbolsToKeepGlobal;
+ NameMatcher SymbolsToSkip;
// Map options
StringMap<SectionRename> SectionsToRename;
diff --git a/llvm/include/llvm/Passes/TargetPassRegistry.inc b/llvm/include/llvm/Passes/TargetPassRegistry.inc
new file mode 100644
index 000000000000..b618331c6998
--- /dev/null
+++ b/llvm/include/llvm/Passes/TargetPassRegistry.inc
@@ -0,0 +1,194 @@
+//===- TargetPassRegistry.inc - Registry of passes --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used as the registry of passes in registerPassBuilderCallbacks
+// Just put the following lines in the body of registerPassBuilderCallbacks:
+// #define GET_PASS_REGISTRY "<Target>PassRegistry.def"
+// #include "llvm/Passes/TargetPassRegistry.inc"
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifdef GET_PASS_REGISTRY
+
+#if !__has_include(GET_PASS_REGISTRY)
+#error "must provide <Target>PassRegistry.def"
+#endif
+
+if (PopulateClassToPassNames) {
+ auto *PIC = PB.getPassInstrumentationCallbacks();
+
+#define ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS) \
+ PIC->addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define ADD_CLASS_PASS_TO_PASS_NAME_WITH_PARAMS(NAME, CLASS) \
+ PIC->addClassToPassName(CLASS, NAME);
+
+#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define MODULE_PASS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ ADD_CLASS_PASS_TO_PASS_NAME_WITH_PARAMS(NAME, CLASS)
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ ADD_CLASS_PASS_TO_PASS_NAME_WITH_PARAMS(NAME, CLASS)
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define LOOP_PASS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) \
+ ADD_CLASS_PASS_TO_PASS_NAME(NAME, CREATE_PASS)
+#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, \
+ PARAMS) \
+ ADD_CLASS_PASS_TO_PASS_NAME_WITH_PARAMS(NAME, CLASS)
+#include GET_PASS_REGISTRY
+#undef MODULE_ANALYSIS
+#undef MODULE_PASS
+#undef MODULE_PASS_WITH_PARAMS
+#undef FUNCTION_ANALYSIS
+#undef FUNCTION_ALIAS_ANALYSIS
+#undef FUNCTION_PASS
+#undef FUNCTION_PASS_WITH_PARAMS
+#undef LOOP_ANALYSIS
+#undef LOOP_PASS
+#undef MACHINE_FUNCTION_ANALYSIS
+#undef MACHINE_FUNCTION_PASS
+#undef MACHINE_FUNCTION_PASS_WITH_PARAMS
+#undef ADD_CLASS_PASS_TO_PASS_NAME
+#undef ADD_CLASS_PASS_TO_PASS_NAME_WITH_PARAMS
+}
+
+#define ADD_PASS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ PM.addPass(CREATE_PASS); \
+ return true; \
+ }
+
+#define ADD_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER) \
+ if (PassBuilder::checkParametrizedPassName(Name, NAME)) { \
+ auto Params = PassBuilder::parsePassParameters(PARSER, Name, NAME); \
+ if (!Params) { \
+ errs() << NAME ": " << toString(Params.takeError()) << '\n'; \
+ return false; \
+ } \
+ PM.addPass(CREATE_PASS(Params.get())); \
+ return true; \
+ }
+
+PB.registerPipelineParsingCallback([=](StringRef Name, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define MODULE_PASS(NAME, CREATE_PASS) ADD_PASS(NAME, CREATE_PASS)
+#include GET_PASS_REGISTRY
+#undef MODULE_PASS
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define MODULE_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ ADD_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER)
+#include GET_PASS_REGISTRY
+#undef MODULE_PASS_WITH_PARAMS
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define FUNCTION_PASS(NAME, CREATE_PASS) ADD_PASS(NAME, CREATE_PASS)
+#include GET_PASS_REGISTRY
+#undef FUNCTION_PASS
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, PARAMS) \
+ ADD_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER)
+#include GET_PASS_REGISTRY
+#undef FUNCTION_PASS_WITH_PARAMS
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name, LoopPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define LOOP_PASS(NAME, CREATE_PASS) ADD_PASS(NAME, CREATE_PASS)
+#include GET_PASS_REGISTRY
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name,
+ MachineFunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) ADD_PASS(NAME, CREATE_PASS)
+#include GET_PASS_REGISTRY
+ return false;
+});
+
+PB.registerPipelineParsingCallback([=](StringRef Name, FunctionPassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+#define MACHINE_FUNCTION_PASS_WITH_PARAMS(NAME, CLASS, CREATE_PASS, PARSER, \
+ PARAMS) \
+ ADD_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER)
+#include GET_PASS_REGISTRY
+#undef MACHINE_FUNCTION_PASS_WITH_PARAMS
+ return false;
+});
+
+#undef ADD_PASS
+#undef ADD_PASS_WITH_PARAMS
+
+PB.registerAnalysisRegistrationCallback([](ModuleAnalysisManager &AM) {
+#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
+ AM.registerPass([&] { return CREATE_PASS; });
+#include GET_PASS_REGISTRY
+#undef MODULE_ANALYSIS
+});
+
+PB.registerAnalysisRegistrationCallback([](FunctionAnalysisManager &AM) {
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ AM.registerPass([&] { return CREATE_PASS; });
+#include GET_PASS_REGISTRY
+#undef FUNCTION_ANALYSIS
+});
+
+PB.registerParseAACallback([](StringRef Name, AAManager &AM) {
+#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ if (Name == NAME) { \
+ AM.registerFunctionAnalysis< \
+ std::remove_reference_t<decltype(CREATE_PASS)>>(); \
+ return true; \
+ }
+#include GET_PASS_REGISTRY
+#undef FUNCTION_ALIAS_ANALYSIS
+ return false;
+});
+
+PB.registerAnalysisRegistrationCallback([](LoopAnalysisManager &AM) {
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ AM.registerPass([&] { return CREATE_PASS; });
+#include GET_PASS_REGISTRY
+#undef LOOP_ANALYSIS
+});
+
+PB.registerAnalysisRegistrationCallback([](MachineFunctionAnalysisManager &AM) {
+#define MACHINE_FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ AM.registerPass([&] { return CREATE_PASS; });
+#include GET_PASS_REGISTRY
+#undef MACHINE_FUNCTION_ANALYSIS
+});
+
+#undef GET_PASS_REGISTRY
+#endif // GET_PASS_REGISTRY
diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
index 0a0e16d2a9e6..158c358a9e46 100644
--- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
+++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h
@@ -84,6 +84,7 @@ bool isLifetimeIntrinsic(Value *V);
Value *readRegister(IRBuilder<> &IRB, StringRef Name);
Value *getFP(IRBuilder<> &IRB);
Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB);
+Value *getAndroidSanitizerSlotPtr(IRBuilder<> &IRB);
} // namespace memtag
} // namespace llvm
diff --git a/llvm/lib/CodeGen/FinalizeISel.cpp b/llvm/lib/CodeGen/FinalizeISel.cpp
index 978355f8eb1b..bf967eac22f1 100644
--- a/llvm/lib/CodeGen/FinalizeISel.cpp
+++ b/llvm/lib/CodeGen/FinalizeISel.cpp
@@ -59,8 +59,7 @@ bool FinalizeISel::runOnMachineFunction(MachineFunction &MF) {
// Set AdjustsStack to true if the instruction selector emits a stack
// frame setup instruction or a stack aligning inlineasm.
- if (MI.getOpcode() == TII->getCallFrameSetupOpcode() ||
- MI.isStackAligningInlineAsm())
+ if (TII->isFrameInstr(MI) || MI.isStackAligningInlineAsm())
MF.getFrameInfo().setAdjustsStack(true);
// If MI is a pseudo, expand it.
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index d3f86af1e290..2a521b6b068a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -6945,10 +6945,6 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
LLT DstTy = MRI.getType(Dst);
LLT CarryTy = MRI.getType(Carry);
- // We want do fold the [u|s]addo.
- if (!MRI.hasOneNonDBGUse(Dst))
- return false;
-
// Fold addo, if the carry is dead -> add, undef.
if (MRI.use_nodbg_empty(Carry) &&
isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {DstTy}})) {
@@ -6959,10 +6955,6 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
return true;
}
- // We want do fold the [u|s]addo.
- if (!MRI.hasOneNonDBGUse(Carry))
- return false;
-
// Canonicalize constant to RHS.
if (isConstantOrConstantVectorI(LHS) && !isConstantOrConstantVectorI(RHS)) {
if (IsSigned) {
@@ -6994,7 +6986,7 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) {
return true;
}
- // Fold (addo x, 0) -> x, no borrow
+ // Fold (addo x, 0) -> x, no carry
if (MaybeRHS && *MaybeRHS == 0 && isConstantLegalOrBeforeLegalizer(CarryTy)) {
MatchInfo = [=](MachineIRBuilder &B) {
B.buildCopy(Dst, LHS);
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index abe23af00a78..8d608f6ac5e4 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -8215,9 +8215,22 @@ LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
// %res = G_SMAX %a, %v2
Register SrcReg = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(SrcReg);
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+ auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg);
+ MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerAbsToCNeg(MachineInstr &MI) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DestReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(SrcReg), IType = LLT::scalar(1);
auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
- MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
+ auto ICmp = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, IType, SrcReg, Zero);
+ MIRBuilder.buildSelect(DestReg, ICmp, SrcReg, Sub);
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index f199625bf67a..7009f375df11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3695,6 +3695,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ unsigned BitWidth = VT.getScalarSizeInBits();
SDLoc DL(N);
auto PeekThroughFreeze = [](SDValue N) {
@@ -3725,16 +3726,12 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
- ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
-
// fold (sub x, c) -> (add x, -c)
- if (N1C) {
+ if (ConstantSDNode *N1C = getAsNonOpaqueConstant(N1))
return DAG.getNode(ISD::ADD, DL, VT, N0,
DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
- }
if (isNullOrNullSplat(N0)) {
- unsigned BitWidth = VT.getScalarSizeInBits();
// Right-shifting everything out but the sign bit followed by negation is
// the same as flipping arithmetic/logical shift type without the negation:
// -(X >>u 31) -> (X >>s 31)
@@ -3934,7 +3931,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
SDValue S0 = N1.getOperand(0);
if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
- if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
+ if (C->getAPIntValue() == (BitWidth - 1))
return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
}
}
@@ -3977,8 +3974,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
SDValue ShAmt = N1.getOperand(1);
ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
- if (ShAmtC &&
- ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
+ if (ShAmtC && ShAmtC->getAPIntValue() == (BitWidth - 1)) {
SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
}
@@ -3989,7 +3985,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// N0 - (X << BW-1) --> N0 + (X << BW-1)
if (N1.getOpcode() == ISD::SHL) {
ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1));
- if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1)
+ if (ShlC && ShlC->getAPIntValue() == (BitWidth - 1))
return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
}
@@ -4022,23 +4018,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
}
- // max(a,b) - min(a,b) --> abd(a,b)
- auto MatchSubMaxMin = [&](unsigned Max, unsigned Min, unsigned Abd) {
- if (N0.getOpcode() != Max || N1.getOpcode() != Min)
- return SDValue();
- if ((N0.getOperand(0) != N1.getOperand(0) ||
- N0.getOperand(1) != N1.getOperand(1)) &&
- (N0.getOperand(0) != N1.getOperand(1) ||
- N0.getOperand(1) != N1.getOperand(0)))
- return SDValue();
- if (!hasOperation(Abd, VT))
- return SDValue();
- return DAG.getNode(Abd, DL, VT, N0.getOperand(0), N0.getOperand(1));
- };
- if (SDValue R = MatchSubMaxMin(ISD::SMAX, ISD::SMIN, ISD::ABDS))
- return R;
- if (SDValue R = MatchSubMaxMin(ISD::UMAX, ISD::UMIN, ISD::ABDU))
- return R;
+ // smax(a,b) - smin(a,b) --> abds(a,b)
+ if (hasOperation(ISD::ABDS, VT) &&
+ sd_match(N0, m_SMax(m_Value(A), m_Value(B))) &&
+ sd_match(N1, m_SMin(m_Specific(A), m_Specific(B))))
+ return DAG.getNode(ISD::ABDS, DL, VT, A, B);
+
+ // umax(a,b) - umin(a,b) --> abdu(a,b)
+ if (hasOperation(ISD::ABDU, VT) &&
+ sd_match(N0, m_UMax(m_Value(A), m_Value(B))) &&
+ sd_match(N1, m_UMin(m_Specific(A), m_Specific(B))))
+ return DAG.getNode(ISD::ABDU, DL, VT, A, B);
return SDValue();
}
@@ -4131,13 +4121,11 @@ SDValue DAGCombiner::visitSUBO(SDNode *N) {
return CombineTo(N, DAG.getConstant(0, DL, VT),
DAG.getConstant(0, DL, CarryVT));
- ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
-
// fold (subox, c) -> (addo x, -c)
- if (IsSigned && N1C && !N1C->isMinSignedValue()) {
- return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
- DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
- }
+ if (ConstantSDNode *N1C = getAsNonOpaqueConstant(N1))
+ if (IsSigned && !N1C->isMinSignedValue())
+ return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
+ DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
// fold (subo x, 0) -> x + no borrow
if (isNullOrNullSplat(N1))
@@ -13844,11 +13832,20 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::SHL) {
// If the original shl may be shifting out bits, do not perform this
// transformation.
- // TODO: Add MaskedValueIsZero check.
unsigned KnownZeroBits = ShVal.getValueSizeInBits() -
ShVal.getOperand(0).getValueSizeInBits();
- if (ShAmtC->getAPIntValue().ugt(KnownZeroBits))
- return SDValue();
+ if (ShAmtC->getAPIntValue().ugt(KnownZeroBits)) {
+ // If the shift is too large, then see if we can deduce that the
+ // shift is safe anyway.
+ // Create a mask that has ones for the bits being shifted out.
+ APInt ShiftOutMask =
+ APInt::getHighBitsSet(ShVal.getValueSizeInBits(),
+ ShAmtC->getAPIntValue().getZExtValue());
+
+ // Check if the bits being shifted out are known to be zero.
+ if (!DAG.MaskedValueIsZero(ShVal, ShiftOutMask))
+ return SDValue();
+ }
}
// Ensure that the shift amount is wide enough for the shifted value.
@@ -23450,9 +23447,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
SmallVector<SDValue, 8> Ops;
-
EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
- SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
// Keep track of what we encounter.
bool AnyInteger = false;
@@ -23462,7 +23457,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
!Op.getOperand(0).getValueType().isVector())
Ops.push_back(Op.getOperand(0));
else if (ISD::UNDEF == Op.getOpcode())
- Ops.push_back(ScalarUndef);
+ Ops.push_back(DAG.getNode(ISD::UNDEF, DL, SVT));
else
return SDValue();
@@ -23482,13 +23477,12 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
// Replace UNDEFs by another scalar UNDEF node, of the final desired type.
if (AnyFP) {
SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
- ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
if (AnyInteger) {
for (SDValue &Op : Ops) {
if (Op.getValueType() == SVT)
continue;
if (Op.isUndef())
- Op = ScalarUndef;
+ Op = DAG.getNode(ISD::UNDEF, DL, SVT);
else
Op = DAG.getBitcast(SVT, Op);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 9d73a42df2a4..cd6f083243d0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5042,8 +5042,9 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
// If Op can't create undef/poison and none of its operands are undef/poison
// then Op is never undef/poison.
- // NOTE: TargetNodes should handle this in themselves in
- // isGuaranteedNotToBeUndefOrPoisonForTargetNode.
+ // NOTE: TargetNodes can handle this in themselves in
+ // isGuaranteedNotToBeUndefOrPoisonForTargetNode or let
+ // TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode handle it.
return !canCreateUndefOrPoison(Op, PoisonOnly, /*ConsiderFlags*/ true,
Depth) &&
all_of(Op->ops(), [&](SDValue V) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 57f8fc409de4..da29b1d5b312 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3786,7 +3786,15 @@ bool TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
Op.getOpcode() == ISD::INTRINSIC_VOID) &&
"Should use isGuaranteedNotToBeUndefOrPoison if you don't know whether Op"
" is a target node!");
- return false;
+
+ // If Op can't create undef/poison and none of its operands are undef/poison
+ // then Op is never undef/poison.
+ return !canCreateUndefOrPoisonForTargetNode(Op, DemandedElts, DAG, PoisonOnly,
+ /*ConsiderFlags*/ true, Depth) &&
+ all_of(Op->ops(), [&](SDValue V) {
+ return DAG.isGuaranteedNotToBeUndefOrPoison(V, PoisonOnly,
+ Depth + 1);
+ });
}
bool TargetLowering::canCreateUndefOrPoisonForTargetNode(
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 3394a1ec8dc4..59e7a9f5eb11 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -746,7 +746,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
Min = Min.zext(ResultBitWidth);
Max = Max.zext(ResultBitWidth);
}
- return ConstantRange(std::move(Min), std::move(Max));
+ return getNonEmpty(std::move(Min), std::move(Max) + 1);
}
case Instruction::SIToFP: {
// TODO: use input range if available
@@ -757,7 +757,7 @@ ConstantRange ConstantRange::castOp(Instruction::CastOps CastOp,
SMin = SMin.sext(ResultBitWidth);
SMax = SMax.sext(ResultBitWidth);
}
- return ConstantRange(std::move(SMin), std::move(SMax));
+ return getNonEmpty(std::move(SMin), std::move(SMax) + 1);
}
case Instruction::FPTrunc:
case Instruction::FPExt:
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index d0face9140de..2ee0c3eb27b9 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -360,7 +360,12 @@ void MCDwarfLineStr::emitRef(MCStreamer *MCOS, StringRef Path) {
size_t Offset = addString(Path);
if (UseRelocs) {
MCContext &Ctx = MCOS->getContext();
- MCOS->emitValue(makeStartPlusIntExpr(Ctx, *LineStrLabel, Offset), RefSize);
+ if (Ctx.getAsmInfo()->needsDwarfSectionOffsetDirective()) {
+ MCOS->emitCOFFSecRel32(LineStrLabel, Offset);
+ } else {
+ MCOS->emitValue(makeStartPlusIntExpr(Ctx, *LineStrLabel, Offset),
+ RefSize);
+ }
} else
MCOS->emitIntValue(Offset, RefSize);
}
diff --git a/llvm/lib/ObjCopy/ConfigManager.cpp b/llvm/lib/ObjCopy/ConfigManager.cpp
index e46b595a56dc..6442f1b958fb 100644
--- a/llvm/lib/ObjCopy/ConfigManager.cpp
+++ b/llvm/lib/ObjCopy/ConfigManager.cpp
@@ -15,7 +15,7 @@ namespace objcopy {
Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
- !Common.SymbolsPrefixRemove.empty() ||
+ !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
!Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() ||
!Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() ||
!Common.SymbolsToLocalize.empty() || !Common.SymbolsToWeaken.empty() ||
@@ -34,7 +34,7 @@ Expected<const COFFConfig &> ConfigManager::getCOFFConfig() const {
Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
if (!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
- !Common.SymbolsPrefixRemove.empty() ||
+ !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
!Common.AllocSectionsPrefix.empty() || !Common.KeepSection.empty() ||
!Common.SymbolsToGlobalize.empty() || !Common.SymbolsToKeep.empty() ||
!Common.SymbolsToLocalize.empty() ||
@@ -56,7 +56,7 @@ Expected<const MachOConfig &> ConfigManager::getMachOConfig() const {
Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition ||
!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
- !Common.SymbolsPrefixRemove.empty() ||
+ !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
!Common.AllocSectionsPrefix.empty() ||
Common.DiscardMode != DiscardType::None || !Common.SymbolsToAdd.empty() ||
!Common.SymbolsToGlobalize.empty() || !Common.SymbolsToLocalize.empty() ||
@@ -77,7 +77,7 @@ Expected<const WasmConfig &> ConfigManager::getWasmConfig() const {
Expected<const XCOFFConfig &> ConfigManager::getXCOFFConfig() const {
if (!Common.AddGnuDebugLink.empty() || Common.ExtractPartition ||
!Common.SplitDWO.empty() || !Common.SymbolsPrefix.empty() ||
- !Common.SymbolsPrefixRemove.empty() ||
+ !Common.SymbolsPrefixRemove.empty() || !Common.SymbolsToSkip.empty() ||
!Common.AllocSectionsPrefix.empty() ||
Common.DiscardMode != DiscardType::None || !Common.AddSection.empty() ||
!Common.DumpSection.empty() || !Common.SymbolsToAdd.empty() ||
diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index e4d6e02f3aa6..205bc1ef5b1a 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -291,6 +291,9 @@ static Error updateAndRemoveSymbols(const CommonConfig &Config,
return Error::success();
Obj.SymbolTable->updateSymbols([&](Symbol &Sym) {
+ if (Config.SymbolsToSkip.matches(Sym.Name))
+ return;
+
// Common and undefined symbols don't make sense as local symbols, and can
// even cause crashes if we localize those, so skip them.
if (!Sym.isCommon() && Sym.getShndx() != SHN_UNDEF &&
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index 46c8e702581e..8224a1492502 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -626,8 +626,11 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
MachineTypes Machine, bool MinGW,
ArrayRef<COFFShortExport> NativeExports) {
- MachineTypes NativeMachine =
- isArm64EC(Machine) ? IMAGE_FILE_MACHINE_ARM64 : Machine;
+ MachineTypes NativeMachine = Machine;
+ if (isArm64EC(Machine)) {
+ NativeMachine = IMAGE_FILE_MACHINE_ARM64;
+ Machine = IMAGE_FILE_MACHINE_ARM64EC;
+ }
std::vector<NewArchiveMember> Members;
ObjectFactory OF(llvm::sys::path::filename(ImportName), NativeMachine);
diff --git a/llvm/lib/ProfileData/RawMemProfReader.cpp b/llvm/lib/ProfileData/RawMemProfReader.cpp
index 0e2b8668bab7..60c37c417aa0 100644
--- a/llvm/lib/ProfileData/RawMemProfReader.cpp
+++ b/llvm/lib/ProfileData/RawMemProfReader.cpp
@@ -127,6 +127,7 @@ CallStackMap readStackInfo(const char *Ptr) {
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
SmallVector<uint64_t> CallStack;
+ CallStack.reserve(NumPCs);
for (uint64_t J = 0; J < NumPCs; J++) {
CallStack.push_back(
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr));
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
index f147ded2ab70..9b5cbe3de137 100644
--- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -30,6 +30,7 @@
#include "llvm/TargetParser/Triple.h"
using namespace llvm;
+using namespace llvm::COFF;
using namespace llvm::object;
using OperandBundleDef = OperandBundleDefT<Value *>;
@@ -45,8 +46,6 @@ static cl::opt<bool> GenerateThunks("arm64ec-generate-thunks", cl::Hidden,
namespace {
-enum class ThunkType { GuestExit, Entry, Exit };
-
class AArch64Arm64ECCallLowering : public ModulePass {
public:
static char ID;
@@ -73,15 +72,15 @@ private:
Type *I64Ty;
Type *VoidTy;
- void getThunkType(FunctionType *FT, AttributeList AttrList, ThunkType TT,
- raw_ostream &Out, FunctionType *&Arm64Ty,
- FunctionType *&X64Ty);
+ void getThunkType(FunctionType *FT, AttributeList AttrList,
+ Arm64ECThunkType TT, raw_ostream &Out,
+ FunctionType *&Arm64Ty, FunctionType *&X64Ty);
void getThunkRetType(FunctionType *FT, AttributeList AttrList,
raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy,
SmallVectorImpl<Type *> &Arm64ArgTypes,
SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr);
- void getThunkArgTypes(FunctionType *FT, AttributeList AttrList, ThunkType TT,
- raw_ostream &Out,
+ void getThunkArgTypes(FunctionType *FT, AttributeList AttrList,
+ Arm64ECThunkType TT, raw_ostream &Out,
SmallVectorImpl<Type *> &Arm64ArgTypes,
SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr);
void canonicalizeThunkType(Type *T, Align Alignment, bool Ret,
@@ -91,13 +90,11 @@ private:
} // end anonymous namespace
-void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
- AttributeList AttrList,
- ThunkType TT, raw_ostream &Out,
- FunctionType *&Arm64Ty,
- FunctionType *&X64Ty) {
- Out << (TT == ThunkType::Entry ? "$ientry_thunk$cdecl$"
- : "$iexit_thunk$cdecl$");
+void AArch64Arm64ECCallLowering::getThunkType(
+ FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT,
+ raw_ostream &Out, FunctionType *&Arm64Ty, FunctionType *&X64Ty) {
+ Out << (TT == Arm64ECThunkType::Entry ? "$ientry_thunk$cdecl$"
+ : "$iexit_thunk$cdecl$");
Type *Arm64RetTy;
Type *X64RetTy;
@@ -108,7 +105,7 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
// The first argument to a thunk is the called function, stored in x9.
// For exit thunks, we pass the called function down to the emulator;
// for entry/guest exit thunks, we just call the Arm64 function directly.
- if (TT == ThunkType::Exit)
+ if (TT == Arm64ECThunkType::Exit)
Arm64ArgTypes.push_back(PtrTy);
X64ArgTypes.push_back(PtrTy);
@@ -125,8 +122,8 @@ void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
}
void AArch64Arm64ECCallLowering::getThunkArgTypes(
- FunctionType *FT, AttributeList AttrList, ThunkType TT, raw_ostream &Out,
- SmallVectorImpl<Type *> &Arm64ArgTypes,
+ FunctionType *FT, AttributeList AttrList, Arm64ECThunkType TT,
+ raw_ostream &Out, SmallVectorImpl<Type *> &Arm64ArgTypes,
SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr) {
Out << "$";
@@ -163,7 +160,7 @@ void AArch64Arm64ECCallLowering::getThunkArgTypes(
X64ArgTypes.push_back(PtrTy);
// x5
Arm64ArgTypes.push_back(I64Ty);
- if (TT != ThunkType::Entry) {
+ if (TT != Arm64ECThunkType::Entry) {
// FIXME: x5 isn't actually used by the x64 side; revisit once we
// have proper isel for varargs
X64ArgTypes.push_back(I64Ty);
@@ -348,7 +345,8 @@ Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
SmallString<256> ExitThunkName;
llvm::raw_svector_ostream ExitThunkStream(ExitThunkName);
FunctionType *Arm64Ty, *X64Ty;
- getThunkType(FT, Attrs, ThunkType::Exit, ExitThunkStream, Arm64Ty, X64Ty);
+ getThunkType(FT, Attrs, Arm64ECThunkType::Exit, ExitThunkStream, Arm64Ty,
+ X64Ty);
if (Function *F = M->getFunction(ExitThunkName))
return F;
@@ -451,8 +449,8 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
SmallString<256> EntryThunkName;
llvm::raw_svector_ostream EntryThunkStream(EntryThunkName);
FunctionType *Arm64Ty, *X64Ty;
- getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::Entry,
- EntryThunkStream, Arm64Ty, X64Ty);
+ getThunkType(F->getFunctionType(), F->getAttributes(),
+ Arm64ECThunkType::Entry, EntryThunkStream, Arm64Ty, X64Ty);
if (Function *F = M->getFunction(EntryThunkName))
return F;
@@ -543,8 +541,8 @@ Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
llvm::raw_null_ostream NullThunkName;
FunctionType *Arm64Ty, *X64Ty;
- getThunkType(F->getFunctionType(), F->getAttributes(), ThunkType::GuestExit,
- NullThunkName, Arm64Ty, X64Ty);
+ getThunkType(F->getFunctionType(), F->getAttributes(),
+ Arm64ECThunkType::GuestExit, NullThunkName, Arm64Ty, X64Ty);
auto MangledName = getArm64ECMangledFunctionName(F->getName().str());
assert(MangledName && "Can't guest exit to function that's already native");
std::string ThunkName = *MangledName;
@@ -679,7 +677,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
struct ThunkInfo {
Constant *Src;
Constant *Dst;
- unsigned Kind;
+ Arm64ECThunkType Kind;
};
SmallVector<ThunkInfo> ThunkMapping;
for (Function &F : Mod) {
@@ -688,14 +686,17 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64) {
if (!F.hasComdat())
F.setComdat(Mod.getOrInsertComdat(F.getName()));
- ThunkMapping.push_back({&F, buildEntryThunk(&F), 1});
+ ThunkMapping.push_back(
+ {&F, buildEntryThunk(&F), Arm64ECThunkType::Entry});
}
}
for (Function *F : DirectCalledFns) {
ThunkMapping.push_back(
- {F, buildExitThunk(F->getFunctionType(), F->getAttributes()), 4});
+ {F, buildExitThunk(F->getFunctionType(), F->getAttributes()),
+ Arm64ECThunkType::Exit});
if (!F->hasDLLImportStorageClass())
- ThunkMapping.push_back({buildGuestExitThunk(F), F, 0});
+ ThunkMapping.push_back(
+ {buildGuestExitThunk(F), F, Arm64ECThunkType::GuestExit});
}
if (!ThunkMapping.empty()) {
@@ -704,7 +705,7 @@ bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
ThunkMappingArrayElems.push_back(ConstantStruct::getAnon(
{ConstantExpr::getBitCast(Thunk.Src, PtrTy),
ConstantExpr::getBitCast(Thunk.Dst, PtrTy),
- ConstantInt::get(M->getContext(), APInt(32, Thunk.Kind))}));
+ ConstantInt::get(M->getContext(), APInt(32, uint8_t(Thunk.Kind)))}));
}
Constant *ThunkMappingArray = ConstantArray::get(
llvm::ArrayType::get(ThunkMappingArrayElems[0]->getType(),
diff --git a/llvm/lib/Target/AArch64/AArch64PassRegistry.def b/llvm/lib/Target/AArch64/AArch64PassRegistry.def
new file mode 100644
index 000000000000..ca944579f93a
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PassRegistry.def
@@ -0,0 +1,20 @@
+//===- AArch64PassRegistry.def - Registry of AArch64 passes -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is used as the registry of passes that are part of the
+// AArch64 backend.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef LOOP_PASS
+#define LOOP_PASS(NAME, CREATE_PASS)
+#endif
+LOOP_PASS("aarch64-lit", AArch64LoopIdiomTransformPass())
+#undef LOOP_PASS
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index e5e60459e814..08238fdf167b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -547,6 +547,10 @@ public:
void AArch64TargetMachine::registerPassBuilderCallbacks(
PassBuilder &PB, bool PopulateClassToPassNames) {
+
+#define GET_PASS_REGISTRY "AArch64PassRegistry.def"
+#include "llvm/Passes/TargetPassRegistry.inc"
+
PB.registerLateLoopOptimizationsEPCallback(
[=](LoopPassManager &LPM, OptimizationLevel Level) {
LPM.addPass(AArch64LoopIdiomTransformPass());
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 996abe8e4739..34e2c1d9c8e2 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1012,6 +1012,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
ABSActions
.legalFor({s32, s64});
ABSActions.legalFor(PackedVectorAllTypeList)
+ .customIf([=](const LegalityQuery &Q) {
+ // TODO: Fix suboptimal codegen for 128+ bit types.
+ LLT SrcTy = Q.Types[0];
+ return SrcTy.isScalar() && SrcTy.getSizeInBits() < 128;
+ })
.widenScalarIf(
[=](const LegalityQuery &Query) { return Query.Types[0] == v4s8; },
[=](const LegalityQuery &Query) { return std::make_pair(0, v4s16); })
@@ -1264,6 +1269,8 @@ bool AArch64LegalizerInfo::legalizeCustom(
return legalizeDynStackAlloc(MI, Helper);
case TargetOpcode::G_PREFETCH:
return legalizePrefetch(MI, Helper);
+ case TargetOpcode::G_ABS:
+ return Helper.lowerAbsToCNeg(MI);
}
llvm_unreachable("expected switch to return");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d7f5110427ec..9bd30458bc0a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -918,6 +918,96 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
+static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
+ for (const auto &CI : IA->ParseConstraints()) {
+ for (StringRef Code : CI.Codes) {
+ Code.consume_front("{");
+ if (Code.starts_with("a"))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+struct AAAMDGPUNoAGPR
+ : public IRAttribute<Attribute::NoUnwind,
+ StateWrapper<BooleanState, AbstractAttribute>,
+ AAAMDGPUNoAGPR> {
+ AAAMDGPUNoAGPR(const IRPosition &IRP, Attributor &A) : IRAttribute(IRP) {}
+
+ static AAAMDGPUNoAGPR &createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDGPUNoAGPR(IRP, A);
+ llvm_unreachable("AAAMDGPUNoAGPR is only valid for function position");
+ }
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ if (F->hasFnAttribute("amdgpu-no-agpr"))
+ indicateOptimisticFixpoint();
+ }
+
+ const std::string getAsStr(Attributor *A) const override {
+ return getAssumed() ? "amdgpu-no-agpr" : "amdgpu-maybe-agpr";
+ }
+
+ void trackStatistics() const override {}
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Use AACallEdges, but then we need a way to inspect asm edges.
+
+ auto CheckForNoAGPRs = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ const Value *CalleeOp = CB.getCalledOperand();
+ const Function *Callee = dyn_cast<Function>(CalleeOp);
+ if (!Callee) {
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
+ return !inlineAsmUsesAGPRs(IA);
+ return false;
+ }
+
+ // Some intrinsics may use AGPRs, but if we have a choice, we are not
+ // required to use AGPRs.
+ if (Callee->isIntrinsic())
+ return true;
+
+ // TODO: Handle callsite attributes
+ const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
+ *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+ return CalleeInfo && CalleeInfo->getAssumed();
+ };
+
+ bool UsedAssumedInformation = false;
+ if (!A.checkForAllCallLikeInstructions(CheckForNoAGPRs, *this,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ if (!getAssumed())
+ return ChangeStatus::UNCHANGED;
+ LLVMContext &Ctx = getAssociatedFunction()->getContext();
+ return A.manifestAttrs(getIRPosition(),
+ {Attribute::get(Ctx, "amdgpu-no-agpr")});
+ }
+
+ const std::string getName() const override { return "AAAMDGPUNoAGPR"; }
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDGPUNoAGPRs
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ static const char ID;
+};
+
+const char AAAMDGPUNoAGPR::ID = 0;
+
static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
for (unsigned I = 0;
@@ -946,8 +1036,9 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
&AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
- &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
- &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
+ &AAAMDWavesPerEU::ID, &AAAMDGPUNoAGPR::ID, &AACallEdges::ID,
+ &AAPointerInfo::ID, &AAPotentialConstantValues::ID,
+ &AAUnderlyingObjects::ID});
AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
@@ -963,6 +1054,7 @@ static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
if (!F.isIntrinsic()) {
A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAAMDGPUNoAGPR>(IRPosition::function(F));
CallingConv::ID CC = F.getCallingConv();
if (!AMDGPU::isEntryFunctionCC(CC)) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 7e1f041fa109..8969b3b5b6ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -715,10 +715,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ }
+ if (!IsEntryFunc) {
if (!Subtarget.enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+ if (!IsGraphics)
+ TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
IncomingValueAssigner Assigner(AssignFn);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 4be64629ddac..9bd0fd7eca76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -23,6 +23,7 @@ def CC_SI_Gfx : CallingConv<[
// 33 is reserved for the frame pointer
// 34 is reserved for the base pointer
CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<[
+ SGPR0, SGPR1, SGPR2, SGPR3,
SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index b85cb26fdc95..595f09664c55 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -340,26 +340,11 @@ public:
// Get uses from the current function, excluding uses by called functions
// Two output variables to avoid walking the globals list twice
- std::optional<bool> HasAbsoluteGVs;
for (auto &GV : M.globals()) {
if (!AMDGPU::isLDSVariableToLower(GV)) {
continue;
}
- // Check if the module is consistent: either all GVs are absolute (happens
- // when we run the pass more than once), or none are.
- const bool IsAbsolute = GV.isAbsoluteSymbolRef();
- if (HasAbsoluteGVs.has_value()) {
- if (*HasAbsoluteGVs != IsAbsolute) {
- report_fatal_error(
- "Module cannot mix absolute and non-absolute LDS GVs");
- }
- } else
- HasAbsoluteGVs = IsAbsolute;
-
- if (IsAbsolute)
- continue;
-
for (User *V : GV.users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
@@ -469,6 +454,31 @@ public:
}
}
+ // Verify that we fall into one of 2 cases:
+ // - All variables are absolute: this is a re-run of the pass
+ // so we don't have anything to do.
+ // - No variables are absolute.
+ std::optional<bool> HasAbsoluteGVs;
+ for (auto &Map : {direct_map_kernel, indirect_map_kernel}) {
+ for (auto &[Fn, GVs] : Map) {
+ for (auto *GV : GVs) {
+ bool IsAbsolute = GV->isAbsoluteSymbolRef();
+ if (HasAbsoluteGVs.has_value()) {
+ if (*HasAbsoluteGVs != IsAbsolute) {
+ report_fatal_error(
+ "Module cannot mix absolute and non-absolute LDS GVs");
+ }
+ } else
+ HasAbsoluteGVs = IsAbsolute;
+ }
+ }
+ }
+
+ // If we only had absolute GVs, we have nothing to do, return an empty
+ // result.
+ if (HasAbsoluteGVs && *HasAbsoluteGVs)
+ return {FunctionVariableMap(), FunctionVariableMap()};
+
return {std::move(direct_map_kernel), std::move(indirect_map_kernel)};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index bb1c6b733729..8eb46a980148 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
-def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
-def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
-def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
+foreach intr = AMDGPUMFMAIntrinsics908 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics90A in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUMFMAIntrinsics940 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
+def : SourceOfDivergence<intr>;
+foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
+def : SourceOfDivergence<intr>;
+
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 2b457fe519d9..c96625092a76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -793,6 +793,15 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});
+
+ PB.registerFullLinkTimeOptimizationLastEPCallback(
+ [this](ModulePassManager &PM, OptimizationLevel Level) {
+ // We want to support the -lto-partitions=N option as "best effort".
+ // For that, we need to lower LDS earlier in the pipeline before the
+ // module is partitioned for codegen.
+ if (EnableLowerModuleLDS)
+ PM.addPass(AMDGPULowerModuleLDSPass(*this));
+ });
}
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5ccf21f76015..7f0cff72c186 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2857,12 +2857,13 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ }
- // FIXME: Sink this into allocateSpecialInputSGPRs
+ if (!IsEntryFunc) {
if (!Subtarget->enableFlatScratch())
CCInfo.AllocateReg(Info->getScratchRSrcReg());
-
- allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
+ if (!IsGraphics)
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
if (!IsKernel) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 2569f40fec0e..12433dc83c48 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -748,35 +748,7 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
}
bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
- for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB) {
- const auto *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
-
- if (CB->isInlineAsm()) {
- const InlineAsm *IA = dyn_cast<InlineAsm>(CB->getCalledOperand());
- for (const auto &CI : IA->ParseConstraints()) {
- for (StringRef Code : CI.Codes) {
- Code.consume_front("{");
- if (Code.starts_with("a"))
- return true;
- }
- }
- continue;
- }
-
- const Function *Callee =
- dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
- if (!Callee)
- return true;
-
- if (Callee->getIntrinsicID() == Intrinsic::not_intrinsic)
- return true;
- }
- }
-
- return false;
+ return !F.hasFnAttribute("amdgpu-no-agpr");
}
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
index fddbaa97d063..2089f5dda6fe 100644
--- a/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFoldMasks.cpp
@@ -47,10 +47,13 @@ public:
StringRef getPassName() const override { return "RISC-V Fold Masks"; }
private:
- bool convertToUnmasked(MachineInstr &MI, MachineInstr *MaskDef) const;
- bool convertVMergeToVMv(MachineInstr &MI, MachineInstr *MaskDef) const;
+ bool convertToUnmasked(MachineInstr &MI) const;
+ bool convertVMergeToVMv(MachineInstr &MI) const;
- bool isAllOnesMask(MachineInstr *MaskDef) const;
+ bool isAllOnesMask(const MachineInstr *MaskDef) const;
+
+ /// Maps uses of V0 to the corresponding def of V0.
+ DenseMap<const MachineInstr *, const MachineInstr *> V0Defs;
};
} // namespace
@@ -59,10 +62,9 @@ char RISCVFoldMasks::ID = 0;
INITIALIZE_PASS(RISCVFoldMasks, DEBUG_TYPE, "RISC-V Fold Masks", false, false)
-bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskDef) const {
- if (!MaskDef)
- return false;
- assert(MaskDef->isCopy() && MaskDef->getOperand(0).getReg() == RISCV::V0);
+bool RISCVFoldMasks::isAllOnesMask(const MachineInstr *MaskDef) const {
+ assert(MaskDef && MaskDef->isCopy() &&
+ MaskDef->getOperand(0).getReg() == RISCV::V0);
Register SrcReg = TRI->lookThruCopyLike(MaskDef->getOperand(1).getReg(), MRI);
if (!SrcReg.isVirtual())
return false;
@@ -89,8 +91,7 @@ bool RISCVFoldMasks::isAllOnesMask(MachineInstr *MaskDef) const {
// Transform (VMERGE_VVM_<LMUL> false, false, true, allones, vl, sew) to
// (VMV_V_V_<LMUL> false, true, vl, sew). It may decrease uses of VMSET.
-bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI,
- MachineInstr *V0Def) const {
+bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI) const {
#define CASE_VMERGE_TO_VMV(lmul) \
case RISCV::PseudoVMERGE_VVM_##lmul: \
NewOpc = RISCV::PseudoVMV_V_V_##lmul; \
@@ -116,7 +117,7 @@ bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI,
return false;
assert(MI.getOperand(4).isReg() && MI.getOperand(4).getReg() == RISCV::V0);
- if (!isAllOnesMask(V0Def))
+ if (!isAllOnesMask(V0Defs.lookup(&MI)))
return false;
MI.setDesc(TII->get(NewOpc));
@@ -133,14 +134,13 @@ bool RISCVFoldMasks::convertVMergeToVMv(MachineInstr &MI,
return true;
}
-bool RISCVFoldMasks::convertToUnmasked(MachineInstr &MI,
- MachineInstr *MaskDef) const {
+bool RISCVFoldMasks::convertToUnmasked(MachineInstr &MI) const {
const RISCV::RISCVMaskedPseudoInfo *I =
RISCV::getMaskedPseudoInfo(MI.getOpcode());
if (!I)
return false;
- if (!isAllOnesMask(MaskDef))
+ if (!isAllOnesMask(V0Defs.lookup(&MI)))
return false;
// There are two classes of pseudos in the table - compares and
@@ -198,20 +198,26 @@ bool RISCVFoldMasks::runOnMachineFunction(MachineFunction &MF) {
// $v0:vr = COPY %mask:vr
// %x:vr = Pseudo_MASK %a:vr, %b:br, $v0:vr
//
- // Because $v0 isn't in SSA, keep track of it so we can check the mask operand
- // on each pseudo.
- MachineInstr *CurrentV0Def;
- for (MachineBasicBlock &MBB : MF) {
- CurrentV0Def = nullptr;
- for (MachineInstr &MI : MBB) {
- Changed |= convertToUnmasked(MI, CurrentV0Def);
- Changed |= convertVMergeToVMv(MI, CurrentV0Def);
+ // Because $v0 isn't in SSA, keep track of its definition at each use so we
+ // can check mask operands.
+ for (const MachineBasicBlock &MBB : MF) {
+ const MachineInstr *CurrentV0Def = nullptr;
+ for (const MachineInstr &MI : MBB) {
+ if (MI.readsRegister(RISCV::V0, TRI))
+ V0Defs[&MI] = CurrentV0Def;
if (MI.definesRegister(RISCV::V0, TRI))
CurrentV0Def = &MI;
}
}
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ Changed |= convertToUnmasked(MI);
+ Changed |= convertVMergeToVMv(MI);
+ }
+ }
+
return Changed;
}
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3aa28215efc2..a3ebfb34ad7a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -559,11 +559,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
if (Subtarget.hasStdExtDOrZdinx()) {
setOperationAction(FPLegalNodeTypes, MVT::f64, Legal);
+ if (!Subtarget.is64Bit())
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+
if (Subtarget.hasStdExtZfa()) {
setOperationAction(FPRndMode, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
- if (!Subtarget.is64Bit())
- setOperationAction(ISD::BITCAST, MVT::i64, Custom);
} else {
if (Subtarget.is64Bit())
setOperationAction(FPRndMode, MVT::f64, Custom);
@@ -6071,8 +6072,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
return FPConv;
}
- if (VT == MVT::f64 && Op0VT == MVT::i64 && XLenVT == MVT::i32 &&
- Subtarget.hasStdExtZfa()) {
+ if (VT == MVT::f64 && Op0VT == MVT::i64 && XLenVT == MVT::i32) {
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitScalar(Op0, DL, MVT::i32, MVT::i32);
SDValue RetReg =
@@ -12157,8 +12157,7 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue FPConv =
DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Op0);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPConv));
- } else if (VT == MVT::i64 && Op0VT == MVT::f64 && XLenVT == MVT::i32 &&
- Subtarget.hasStdExtZfa()) {
+ } else if (VT == MVT::i64 && Op0VT == MVT::f64 && XLenVT == MVT::i32) {
SDValue NewReg = DAG.getNode(RISCVISD::SplitF64, DL,
DAG.getVTList(MVT::i32, MVT::i32), Op0);
SDValue RetReg = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 10bf1e88d741..74d65324b95d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -446,6 +446,13 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
(Lo12 & 0b11111) != 0) {
// Prefetch instructions require the offset to be 32 byte aligned.
MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+ } else if ((Opc == RISCV::PseudoRV32ZdinxLD ||
+ Opc == RISCV::PseudoRV32ZdinxSD) &&
+ Lo12 >= 2044) {
+ // This instruction will be split into 2 instructions. The second
+ // instruction will add 4 to the immediate. If that would overflow 12
+ // bits, we can't fold the offset.
+ MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
} else {
// We can encode an add with 12 bit signed immediate in the immediate
// operand of our user instruction. As a result, the remaining
@@ -741,8 +748,11 @@ bool RISCVRegisterInfo::getRegAllocationHints(
bool NeedGPRC) -> void {
Register Reg = MO.getReg();
Register PhysReg = Reg.isPhysical() ? Reg : Register(VRM->getPhys(Reg));
- if (PhysReg && (!NeedGPRC || RISCV::GPRCRegClass.contains(PhysReg))) {
- assert(!MO.getSubReg() && !VRRegMO.getSubReg() && "Unexpected subreg!");
+ // TODO: Support GPRPair subregisters? Need to be careful with even/odd
+ // registers. If the virtual register is an odd register of a pair and the
+ // physical register is even (or vice versa), we should not add the hint.
+ if (PhysReg && (!NeedGPRC || RISCV::GPRCRegClass.contains(PhysReg)) &&
+ !MO.getSubReg() && !VRRegMO.getSubReg()) {
if (!MRI->isReserved(PhysReg) && !is_contained(Hints, PhysReg))
TwoAddrHints.insert(PhysReg);
}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 225b57554c1d..9da1f73681c6 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -573,7 +573,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
}
let RegInfos = RegInfoByHwMode<[RV32, RV64],
- [RegInfo<64, 64, 64>, RegInfo<128, 128, 128>]>,
+ [RegInfo<64, 64, 32>, RegInfo<128, 128, 64>]>,
DecoderMethod = "DecodeGPRPairRegisterClass" in
def GPRPair : RegisterClass<"RISCV", [XLenPairFVT], 64, (add
X10_X11, X12_X13, X14_X15, X16_X17,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d336ab9d309c..e9bdc6ab5bea 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1480,6 +1480,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
Kind = improveShuffleKindFromMask(Kind, Mask, BaseTp, Index, SubTp);
+ // Recognize a basic concat_vector shuffle.
+ if (Kind == TTI::SK_PermuteTwoSrc &&
+ Mask.size() == (2 * BaseTp->getElementCount().getKnownMinValue()) &&
+ ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
+ return getShuffleCost(TTI::SK_InsertSubvector,
+ VectorType::getDoubleElementsVectorType(BaseTp), Mask,
+ CostKind, Mask.size() / 2, BaseTp);
+
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
Kind = TTI::SK_PermuteTwoSrc;
diff --git a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
index 0694d8f28df6..2e36d4a8b98c 100644
--- a/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
+++ b/llvm/lib/TextAPI/BinaryReader/DylibReader.cpp
@@ -293,8 +293,11 @@ static Error readSymbols(MachOObjectFile *Obj, RecordsSlice &Slice,
RecordLinkage Linkage = RecordLinkage::Unknown;
SymbolFlags RecordFlags = SymbolFlags::None;
- if (Opt.Undefineds && (Flags & SymbolRef::SF_Undefined)) {
- Linkage = RecordLinkage::Undefined;
+ if (Flags & SymbolRef::SF_Undefined) {
+ if (Opt.Undefineds)
+ Linkage = RecordLinkage::Undefined;
+ else
+ continue;
if (Flags & SymbolRef::SF_Weak)
RecordFlags |= SymbolFlags::WeakReferenced;
} else if (Flags & SymbolRef::SF_Exported) {
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index f98833bd1198..ff680e998e71 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -5190,6 +5190,12 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA,
} else if (auto *LI = dyn_cast<LoadInst>(I)) {
if (LI->getPointerOperand() == UseV)
MA = LI->getAlign();
+ } else if (auto *AI = dyn_cast<AtomicRMWInst>(I)) {
+ if (AI->getPointerOperand() == UseV)
+ MA = AI->getAlign();
+ } else if (auto *AI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (AI->getPointerOperand() == UseV)
+ MA = AI->getAlign();
}
if (!MA || *MA <= QueryingAA.getKnownAlign())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index aaf7184a5562..a978e9a643f5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -819,7 +819,7 @@ static Instruction *foldNoWrapAdd(BinaryOperator &Add,
Value *X;
const APInt *C1, *C2;
if (match(Op1, m_APInt(C1)) &&
- match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
+ match(Op0, m_OneUse(m_ZExt(m_NUWAddLike(m_Value(X), m_APInt(C2))))) &&
C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
Constant *NewC =
ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
@@ -829,14 +829,16 @@ static Instruction *foldNoWrapAdd(BinaryOperator &Add,
// More general combining of constants in the wide type.
// (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
Constant *NarrowC;
- if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ if (match(Op0,
+ m_OneUse(m_SExt(m_NSWAddLike(m_Value(X), m_Constant(NarrowC)))))) {
Value *WideC = Builder.CreateSExt(NarrowC, Ty);
Value *NewC = Builder.CreateAdd(WideC, Op1C);
Value *WideX = Builder.CreateSExt(X, Ty);
return BinaryOperator::CreateAdd(WideX, NewC);
}
// (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
- if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ if (match(Op0,
+ m_OneUse(m_ZExt(m_NUWAddLike(m_Value(X), m_Constant(NarrowC)))))) {
Value *WideC = Builder.CreateZExt(NarrowC, Ty);
Value *NewC = Builder.CreateAdd(WideC, Op1C);
Value *WideX = Builder.CreateZExt(X, Ty);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 426b548c074a..526fe5d08059 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2093,8 +2093,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
bool IsSigned = IID == Intrinsic::sadd_with_overflow;
- bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
- : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
+ bool HasNWAdd = IsSigned
+ ? match(Arg0, m_NSWAddLike(m_Value(X), m_APInt(C0)))
+ : match(Arg0, m_NUWAddLike(m_Value(X), m_APInt(C0)));
if (HasNWAdd && match(Arg1, m_APInt(C1))) {
bool Overflow;
APInt NewC =
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 9d4c271f990d..10a4b1c2060c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -814,8 +814,19 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
if (match(Op1, m_SpecificFP(-1.0)))
return UnaryOperator::CreateFNegFMF(Op0, &I);
- // With no-nans: X * 0.0 --> copysign(0.0, X)
- if (I.hasNoNaNs() && match(Op1, m_PosZeroFP())) {
+ // With no-nans/no-infs:
+ // X * 0.0 --> copysign(0.0, X)
+ // X * -0.0 --> copysign(0.0, -X)
+ const APFloat *FPC;
+ if (match(Op1, m_APFloatAllowUndef(FPC)) && FPC->isZero() &&
+ ((I.hasNoInfs() &&
+ isKnownNeverNaN(Op0, /*Depth=*/0, SQ.getWithInstruction(&I))) ||
+ isKnownNeverNaN(&I, /*Depth=*/0, SQ.getWithInstruction(&I)))) {
+ if (FPC->isNegative())
+ Op0 = Builder.CreateFNegFMF(Op0, &I);
+ Op1 = Constant::replaceUndefsWith(
+ cast<Constant>(Op1),
+ ConstantFP::get(Op1->getType()->getScalarType(), *FPC));
CallInst *CopySign = Builder.CreateIntrinsic(Intrinsic::copysign,
{I.getType()}, {Op1, Op0}, &I);
return replaceInstUsesWith(I, CopySign);
@@ -1160,14 +1171,14 @@ Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
// We need a multiple of the divisor for a signed add constant, but
// unsigned is fine with any constant pair.
if (IsSigned &&
- match(Op0, m_NSWAdd(m_NSWMul(m_Value(X), m_SpecificInt(*C2)),
- m_APInt(C1))) &&
+ match(Op0, m_NSWAddLike(m_NSWMul(m_Value(X), m_SpecificInt(*C2)),
+ m_APInt(C1))) &&
isMultiple(*C1, *C2, Quotient, IsSigned)) {
return BinaryOperator::CreateNSWAdd(X, ConstantInt::get(Ty, Quotient));
}
if (!IsSigned &&
- match(Op0, m_NUWAdd(m_NUWMul(m_Value(X), m_SpecificInt(*C2)),
- m_APInt(C1)))) {
+ match(Op0, m_NUWAddLike(m_NUWMul(m_Value(X), m_SpecificInt(*C2)),
+ m_APInt(C1)))) {
return BinaryOperator::CreateNUWAdd(X,
ConstantInt::get(Ty, C1->udiv(*C2)));
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index eafd2889ec50..95aa2119e2d8 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -437,7 +437,7 @@ Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
Value *A;
Constant *C, *C1;
if (match(Op0, m_Constant(C)) &&
- match(Op1, m_NUWAdd(m_Value(A), m_Constant(C1)))) {
+ match(Op1, m_NUWAddLike(m_Value(A), m_Constant(C1)))) {
Value *NewC = Builder.CreateBinOp(I.getOpcode(), C, C1);
BinaryOperator *NewShiftOp = BinaryOperator::Create(I.getOpcode(), NewC, A);
if (I.getOpcode() == Instruction::Shl) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 3c4c0f35eb6d..c7f4fb17648c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -2135,7 +2135,8 @@ static Instruction *foldSelectShuffleOfSelectShuffle(ShuffleVectorInst &Shuf) {
return new ShuffleVectorInst(X, Y, NewMask);
}
-static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
+static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf,
+ const SimplifyQuery &SQ) {
assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
// Are we shuffling together some value and that same value after it has been
@@ -2159,6 +2160,19 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
if (!IdC)
return nullptr;
+ Value *X = Op0IsBinop ? Op1 : Op0;
+
+ // Prevent folding in the case the non-binop operand might have NaN values.
+ // If X can have NaN elements then we have that the floating point math
+ // operation in the transformed code may not preserve the exact NaN
+ // bit-pattern -- e.g. `fadd sNaN, 0.0 -> qNaN`.
+ // This makes the transformation incorrect since the original program would
+ // have preserved the exact NaN bit-pattern.
+ // Avoid the folding if X can have NaN elements.
+ if (Shuf.getType()->getElementType()->isFloatingPointTy() &&
+ !isKnownNeverNaN(X, 0, SQ))
+ return nullptr;
+
// Shuffle identity constants into the lanes that return the original value.
// Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
// Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
@@ -2175,7 +2189,6 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
// shuf (bop X, C), X, M --> bop X, C'
// shuf X, (bop X, C), M --> bop X, C'
- Value *X = Op0IsBinop ? Op1 : Op0;
Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
NewBO->copyIRFlags(BO);
@@ -2241,7 +2254,8 @@ Instruction *InstCombinerImpl::foldSelectShuffle(ShuffleVectorInst &Shuf) {
if (Instruction *I = foldSelectShuffleOfSelectShuffle(Shuf))
return I;
- if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
+ if (Instruction *I = foldSelectShuffleWith1Binop(
+ Shuf, getSimplifyQuery().getWithInstruction(&Shuf)))
return I;
BinaryOperator *B0, *B1;
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 4bdeb6bbab85..3c95610fa3e8 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -363,7 +363,7 @@ private:
Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, unsigned AllocaNo);
Value *getUARTag(IRBuilder<> &IRB);
- Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
+ Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB);
Value *applyTagMask(IRBuilder<> &IRB, Value *OldTag);
unsigned retagMask(unsigned AllocaNo);
@@ -1219,20 +1219,10 @@ Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
return UntaggedPtrLong;
}
-Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) {
- // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
- // in Bionic's libc/private/bionic_tls.h.
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- return IRB.CreateConstGEP1_32(Int8Ty, IRB.CreateCall(ThreadPointerFunc),
- 0x30);
- }
- if (ThreadPtrGlobal)
- return ThreadPtrGlobal;
-
- return nullptr;
+Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB) {
+ if (TargetTriple.isAArch64() && TargetTriple.isAndroid())
+ return memtag::getAndroidSanitizerSlotPtr(IRB);
+ return ThreadPtrGlobal;
}
Value *HWAddressSanitizer::getCachedFP(IRBuilder<> &IRB) {
@@ -1271,7 +1261,7 @@ void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
auto getThreadLongMaybeUntagged = [&]() {
if (!SlotPtr)
- SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
+ SlotPtr = getHwasanThreadSlotPtr(IRB);
if (!ThreadLong)
ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
// Extract the address field from ThreadLong. Unnecessary on AArch64 with
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 55728709cde5..50eccc69a38a 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -920,7 +920,7 @@ static void instrumentOneFunc(
// on the instrumentation call based on the funclet coloring.
DenseMap<BasicBlock *, ColorVector> BlockColors;
if (F.hasPersonalityFn() &&
- isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
BlockColors = colorEHFunclets(F);
// For each VP Kind, walk the VP candidates and instrument each one.
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index ccca8bcc1a56..de8c05d5689f 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -359,9 +359,7 @@ bool Float2IntPass::validateAndTransform() {
// The number of bits required is the maximum of the upper and
// lower limits, plus one so it can be signed.
- unsigned MinBW = std::max(R.getLower().getSignificantBits(),
- R.getUpper().getSignificantBits()) +
- 1;
+ unsigned MinBW = R.getMinSignedBits() + 1;
LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
// If we've run off the realms of the exactly representable integers,
diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
index 8dd1002a6e4a..fd94a120bc66 100644
--- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
+++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp
@@ -273,5 +273,15 @@ Value *getFP(IRBuilder<> &IRB) {
IRB.getIntPtrTy(M->getDataLayout()));
}
+Value *getAndroidSanitizerSlotPtr(IRBuilder<> &IRB) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
+ // in Bionic's libc/private/bionic_tls.h.
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ return IRB.CreateConstGEP1_32(IRB.getInt8Ty(),
+ IRB.CreateCall(ThreadPointerFunc), 0x30);
+}
+
} // namespace memtag
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a52064e5417b..36b446962c4a 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9275,11 +9275,16 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// Check if any of the gather node forms an insertelement buildvector
// somewhere.
- if (any_of(VectorizableTree, [](const std::unique_ptr<TreeEntry> &TE) {
+ bool IsAllowedSingleBVNode =
+ VectorizableTree.size() > 1 ||
+ (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
+ allSameBlock(VectorizableTree.front()->Scalars));
+ if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
return TE->State == TreeEntry::NeedToGather &&
- all_of(TE->Scalars, [](Value *V) {
+ all_of(TE->Scalars, [&](Value *V) {
return isa<ExtractElementInst, UndefValue>(V) ||
- (!V->hasNUsesOrMore(UsesLimit) &&
+ (IsAllowedSingleBVNode &&
+ !V->hasNUsesOrMore(UsesLimit) &&
any_of(V->users(), [](User *U) {
return isa<InsertElementInst>(U);
}));
@@ -13995,7 +14000,7 @@ bool BoUpSLP::collectValuesToDemote(
if (MultiNodeScalars.contains(V))
return false;
uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
- if (OrigBitWidth < BitWidth) {
+ if (OrigBitWidth > BitWidth) {
APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
return true;
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 23494314f132..7e86137f23f3 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -66,8 +66,8 @@ class VectorCombine {
public:
VectorCombine(Function &F, const TargetTransformInfo &TTI,
const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
- bool TryEarlyFoldsOnly)
- : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC),
+ const DataLayout *DL, bool TryEarlyFoldsOnly)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), AA(AA), AC(AC), DL(DL),
TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
bool run();
@@ -79,6 +79,7 @@ private:
const DominatorTree &DT;
AAResults &AA;
AssumptionCache &AC;
+ const DataLayout *DL;
/// If true, only perform beneficial early IR transforms. Do not introduce new
/// vector operations.
@@ -181,7 +182,6 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// We use minimal alignment (maximum flexibility) because we only care about
// the dereferenceable region. When calculating cost and creating a new op,
// we may use a larger value based on alignment attributes.
- const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
@@ -189,15 +189,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
unsigned OffsetEltIndex = 0;
Align Alignment = Load->getAlign();
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
&DT)) {
// It is not safe to load directly from the pointer, but we can still peek
// through gep offsets and check if it safe to load from a base address with
// updated alignment. If it is, we can shuffle the element(s) into place
// after loading.
- unsigned OffsetBitWidth = DL.getIndexTypeSizeInBits(SrcPtr->getType());
+ unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
APInt Offset(OffsetBitWidth, 0);
- SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+ SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
// We want to shuffle the result down from a high element of a vector, so
// the offset must be positive.
@@ -215,7 +215,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
if (OffsetEltIndex >= MinVecNumElts)
return false;
- if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), DL, Load, &AC,
+ if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
&DT))
return false;
@@ -227,7 +227,7 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Original pattern: insertelt undef, load [free casts of] PtrOp, 0
// Use the greater of the alignment on the load or its source pointer.
- Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
InstructionCost OldCost =
@@ -298,14 +298,13 @@ bool VectorCombine::widenSubvectorLoad(Instruction &I) {
// the dereferenceable region. When calculating cost and creating a new op,
// we may use a larger value based on alignment attributes.
auto *Ty = cast<FixedVectorType>(I.getType());
- const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
Align Alignment = Load->getAlign();
- if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), DL, Load, &AC, &DT))
+ if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
return false;
- Alignment = std::max(SrcPtr->getPointerAlignment(DL), Alignment);
+ Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
Type *LoadTy = Load->getType();
unsigned AS = Load->getPointerAddressSpace();
@@ -854,7 +853,6 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
// Scalarize the intrinsic
ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
Value *EVL = VPI.getArgOperand(3);
- const DataLayout &DL = VPI.getModule()->getDataLayout();
// If the VP op might introduce UB or poison, we can scalarize it provided
// that we know the EVL > 0: If the EVL is zero, then the original VP op
@@ -867,7 +865,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
else
SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
*FunctionalOpcode, &VPI, nullptr, &AC, &DT);
- if (!SafeToSpeculate && !isKnownNonZero(EVL, DL, 0, &AC, &VPI, &DT))
+ if (!SafeToSpeculate && !isKnownNonZero(EVL, *DL, 0, &AC, &VPI, &DT))
return false;
Value *ScalarVal =
@@ -1246,12 +1244,11 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
if (auto *Load = dyn_cast<LoadInst>(Source)) {
auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
- const DataLayout &DL = I.getModule()->getDataLayout();
Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
// Don't optimize for atomic/volatile load or store. Ensure memory is not
// modified between, vector type matches store size, and index is inbounds.
if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
- !DL.typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
+ !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
SrcAddr != SI->getPointerOperand()->stripPointerCasts())
return false;
@@ -1270,7 +1267,7 @@ bool VectorCombine::foldSingleElementStore(Instruction &I) {
NSI->copyMetadata(*SI);
Align ScalarOpAlignment = computeAlignmentAfterScalarization(
std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
- DL);
+ *DL);
NSI->setAlignment(ScalarOpAlignment);
replaceValue(I, *NSI);
eraseInstruction(I);
@@ -1288,8 +1285,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
auto *VecTy = cast<VectorType>(I.getType());
auto *LI = cast<LoadInst>(&I);
- const DataLayout &DL = I.getModule()->getDataLayout();
- if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(VecTy->getScalarType()))
+ if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
return false;
InstructionCost OriginalCost =
@@ -1367,7 +1363,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
VecTy->getElementType(), GEP, EI->getName() + ".scalar"));
Align ScalarOpAlignment = computeAlignmentAfterScalarization(
- LI->getAlign(), VecTy->getElementType(), Idx, DL);
+ LI->getAlign(), VecTy->getElementType(), Idx, *DL);
NewLoad->setAlignment(ScalarOpAlignment);
replaceValue(*EI, *NewLoad);
@@ -2042,7 +2038,8 @@ PreservedAnalyses VectorCombinePass::run(Function &F,
TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
AAResults &AA = FAM.getResult<AAManager>(F);
- VectorCombine Combiner(F, TTI, DT, AA, AC, TryEarlyFoldsOnly);
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+ VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TryEarlyFoldsOnly);
if (!Combiner.run())
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 623c43d564cc..8159d7f8a0a1 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -435,7 +435,7 @@ if(runtimes)
list(APPEND extra_deps "flang-new")
endif()
foreach(dep opt llvm-link llvm-extract clang clang-offload-packager)
- if(TARGET ${dep} AND OPENMP_ENABLE_LIBOMPTARGET)
+ if(TARGET ${dep})
list(APPEND extra_deps ${dep})
endif()
endforeach()
@@ -531,7 +531,7 @@ if(runtimes)
check_apple_target(${name} runtime)
runtime_register_target(${name}
- DEPENDS ${builtins_dep_name} ${hdrgen_deps}
+ DEPENDS ${builtins_dep_name} ${extra_deps}
CMAKE_ARGS -DLLVM_DEFAULT_TARGET_TRIPLE=${name} ${libc_cmake_args}
EXTRA_ARGS TARGET_TRIPLE ${name})
endforeach()
diff --git a/llvm/test/Analysis/AliasSet/intrinsics.ll b/llvm/test/Analysis/AliasSet/intrinsics.ll
index aeb5424ca919..678d6d246e63 100644
--- a/llvm/test/Analysis/AliasSet/intrinsics.ll
+++ b/llvm/test/Analysis/AliasSet/intrinsics.ll
@@ -1,9 +1,8 @@
-; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=print-alias-sets -S -o - < %s 2>&1 | FileCheck %s --implicit-check-not="Unknown instructions"
; CHECK: Alias sets for function 'test1':
; CHECK: Alias Set Tracker: 2 alias sets for 2 pointer values.
; CHECK: AliasSet[0x{{[0-9a-f]+}}, 1] must alias, Mod Memory locations: (ptr %a, LocationSize::precise(1))
-; CHECK-NOT: 1 Unknown instruction
; CHECK: AliasSet[0x{{[0-9a-f]+}}, 1] must alias, Mod Memory locations: (ptr %b, LocationSize::precise(1))
define void @test1(i32 %c) {
entry:
@@ -64,7 +63,6 @@ entry:
; CHECK: Alias sets for function 'test5':
; CHECK: Alias Set Tracker: 2 alias sets for 2 pointer values.
; CHECK: AliasSet[0x{{[0-9a-f]+}}, 1] must alias, Mod Memory locations: (ptr %a, LocationSize::precise(1))
-; CHECK-NOT: 1 Unknown instruction
; CHECK: AliasSet[0x{{[0-9a-f]+}}, 1] must alias, Mod Memory locations: (ptr %b, LocationSize::precise(1))
define void @test5() {
entry:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
index 6fced31a622d..ec66892b98fc 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-overflow.mir
@@ -92,3 +92,87 @@ body: |
$w1 = COPY %o_wide
RET_ReallyLR implicit $w0
...
+---
+name: add_multiuse
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_multiuse
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: %const:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w1 = COPY [[COPY]](s32)
+ ; CHECK-NEXT: $w2 = COPY %const(s32)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %const:_(s32) = G_CONSTANT i32 0
+ %add:_(s32), %o:_(s1) = G_SADDO %0, %const
+ %o_wide:_(s32) = G_ZEXT %o(s1)
+ $w0 = COPY %add(s32)
+ $w1 = COPY %add(s32)
+ $w2 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_vector
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_vector
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $w3
+ ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: %bv1:_(<4 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32), [[COPY2]](s32), [[COPY3]](s32)
+ ; CHECK-NEXT: %add:_(<4 x s32>), %o:_(<4 x s1>) = G_UADDO %bv0, %bv1
+ ; CHECK-NEXT: %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s1>)
+ ; CHECK-NEXT: $q0 = COPY %add(<4 x s32>)
+ ; CHECK-NEXT: $q1 = COPY %o_wide(<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = COPY $w2
+ %3:_(s32) = COPY $w3
+ %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %0:_(s32), %1:_(s32)
+ %bv1:_(<4 x s32>) = G_BUILD_VECTOR %2:_(s32), %3:_(s32), %2:_(s32), %3:_(s32)
+ %add:_(<4 x s32>), %o:_(<4 x s1>) = G_UADDO %bv0, %bv1
+ %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s1>)
+ $q0 = COPY %add(<4 x s32>)
+ $q1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
+---
+name: add_splat_vector
+body: |
+ bb.0:
+ liveins: $w0, $w1
+ ; CHECK-LABEL: name: add_splat_vector
+ ; CHECK: liveins: $w0, $w1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
+ ; CHECK-NEXT: %bv0:_(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; CHECK-NEXT: %o:_(<4 x s1>) = G_BUILD_VECTOR [[C]](s1), [[C]](s1), [[C]](s1), [[C]](s1)
+ ; CHECK-NEXT: %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s1>)
+ ; CHECK-NEXT: $q0 = COPY %bv0(<4 x s32>)
+ ; CHECK-NEXT: $q1 = COPY %o_wide(<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $w0
+ %0:_(s32) = COPY $w0
+ %1:_(s32) = COPY $w1
+ %2:_(s32) = COPY $w2
+ %3:_(s32) = COPY $w3
+ %const:_(s32) = G_CONSTANT i32 0
+ %bv0:_(<4 x s32>) = G_BUILD_VECTOR %0:_(s32), %1:_(s32), %0:_(s32), %1:_(s32)
+ %bv1:_(<4 x s32>) = G_BUILD_VECTOR %const:_(s32), %const:_(s32), %const:_(s32), %const:_(s32)
+ %add:_(<4 x s32>), %o:_(<4 x s1>) = G_SADDO %bv0, %bv1
+ %o_wide:_(<4 x s32>) = G_ZEXT %o(<4 x s1>)
+ $q0 = COPY %add(<4 x s32>)
+ $q1 = COPY %o_wide
+ RET_ReallyLR implicit $w0
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-abs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-abs.mir
index 3123e304116f..0d429ae38402 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-abs.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-abs.mir
@@ -8,11 +8,12 @@ body: |
bb.0:
; CHECK-LABEL: name: abs_s32
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 31
- ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[COPY]], [[ASHR]]
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]]
- ; CHECK-NEXT: $w0 = COPY [[XOR]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C]], [[COPY]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[COPY]], [[SUB]]
+ ; CHECK-NEXT: $w0 = COPY [[SELECT]](s32)
+ ;
; CHECK-CSSC-LABEL: name: abs_s32
; CHECK-CSSC: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
; CHECK-CSSC-NEXT: [[ABS:%[0-9]+]]:_(s32) = G_ABS [[COPY]]
@@ -28,11 +29,12 @@ body: |
bb.0:
; CHECK-LABEL: name: abs_s64
; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 63
- ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s64)
- ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[COPY]], [[ASHR]]
- ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ADD]], [[ASHR]]
- ; CHECK-NEXT: $x0 = COPY [[XOR]](s64)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s64) = G_SUB [[C]], [[COPY]]
+ ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(sgt), [[COPY]](s64), [[C]]
+ ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[ICMP]](s32), [[COPY]], [[SUB]]
+ ; CHECK-NEXT: $x0 = COPY [[SELECT]](s64)
+ ;
; CHECK-CSSC-LABEL: name: abs_s64
; CHECK-CSSC: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
; CHECK-CSSC-NEXT: [[ABS:%[0-9]+]]:_(s64) = G_ABS [[COPY]]
@@ -55,6 +57,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<4 x s16>) = G_ABS [[COPY]]
; CHECK-NEXT: $d0 = COPY [[ABS]](<4 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
+ ;
; CHECK-CSSC-LABEL: name: abs_v4s16
; CHECK-CSSC: liveins: $d0
; CHECK-CSSC-NEXT: {{ $}}
@@ -82,6 +85,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<8 x s16>) = G_ABS [[COPY]]
; CHECK-NEXT: $q0 = COPY [[ABS]](<8 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
+ ;
; CHECK-CSSC-LABEL: name: abs_v8s16
; CHECK-CSSC: liveins: $q0
; CHECK-CSSC-NEXT: {{ $}}
@@ -109,6 +113,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<2 x s32>) = G_ABS [[COPY]]
; CHECK-NEXT: $d0 = COPY [[ABS]](<2 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
+ ;
; CHECK-CSSC-LABEL: name: abs_v2s32
; CHECK-CSSC: liveins: $d0
; CHECK-CSSC-NEXT: {{ $}}
@@ -136,6 +141,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<4 x s32>) = G_ABS [[COPY]]
; CHECK-NEXT: $q0 = COPY [[ABS]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
+ ;
; CHECK-CSSC-LABEL: name: abs_v4s32
; CHECK-CSSC: liveins: $q0
; CHECK-CSSC-NEXT: {{ $}}
@@ -163,6 +169,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<8 x s8>) = G_ABS [[COPY]]
; CHECK-NEXT: $d0 = COPY [[ABS]](<8 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $d0
+ ;
; CHECK-CSSC-LABEL: name: abs_v4s8
; CHECK-CSSC: liveins: $d0
; CHECK-CSSC-NEXT: {{ $}}
@@ -190,6 +197,7 @@ body: |
; CHECK-NEXT: [[ABS:%[0-9]+]]:_(<16 x s8>) = G_ABS [[COPY]]
; CHECK-NEXT: $q0 = COPY [[ABS]](<16 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
+ ;
; CHECK-CSSC-LABEL: name: abs_v16s8
; CHECK-CSSC: liveins: $q0
; CHECK-CSSC-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index e00f70b94e3b..78c1ff7b9937 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -15,9 +15,8 @@ define i8 @abs_i8(i8 %a){
; CHECK-GI-LABEL: abs_i8:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxtb w8, w0
-; CHECK-GI-NEXT: asr w8, w8, #7
-; CHECK-GI-NEXT: add w9, w0, w8
-; CHECK-GI-NEXT: eor w0, w9, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: cneg w0, w0, le
; CHECK-GI-NEXT: ret
entry:
%res = call i8 @llvm.abs.i8(i8 %a, i1 0)
@@ -36,9 +35,8 @@ define i16 @abs_i16(i16 %a){
; CHECK-GI-LABEL: abs_i16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: sxth w8, w0
-; CHECK-GI-NEXT: asr w8, w8, #15
-; CHECK-GI-NEXT: add w9, w0, w8
-; CHECK-GI-NEXT: eor w0, w9, w8
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: cneg w0, w0, le
; CHECK-GI-NEXT: ret
entry:
%res = call i16 @llvm.abs.i16(i16 %a, i1 0)
@@ -55,9 +53,8 @@ define i32 @abs_i32(i32 %a){
;
; CHECK-GI-LABEL: abs_i32:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: asr w8, w0, #31
-; CHECK-GI-NEXT: add w9, w0, w8
-; CHECK-GI-NEXT: eor w0, w9, w8
+; CHECK-GI-NEXT: cmp w0, #0
+; CHECK-GI-NEXT: cneg w0, w0, le
; CHECK-GI-NEXT: ret
entry:
%res = call i32 @llvm.abs.i32(i32 %a, i1 0)
@@ -74,9 +71,8 @@ define i64 @abs_i64(i64 %a){
;
; CHECK-GI-LABEL: abs_i64:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: asr x8, x0, #63
-; CHECK-GI-NEXT: add x9, x0, x8
-; CHECK-GI-NEXT: eor x0, x9, x8
+; CHECK-GI-NEXT: cmp x0, #0
+; CHECK-GI-NEXT: cneg x0, x0, le
; CHECK-GI-NEXT: ret
entry:
%res = call i64 @llvm.abs.i64(i64 %a, i1 0)
@@ -248,9 +244,9 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
; CHECK-GI-LABEL: abs_v1i32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: fmov w8, s0
-; CHECK-GI-NEXT: asr w9, w8, #31
-; CHECK-GI-NEXT: add w8, w8, w9
-; CHECK-GI-NEXT: eor w8, w8, w9
+; CHECK-GI-NEXT: fmov w9, s0
+; CHECK-GI-NEXT: cmp w8, #0
+; CHECK-GI-NEXT: cneg w8, w9, le
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
index 77c70668b65a..0ec2d763685e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
@@ -2643,8 +2643,7 @@ define i8 @pr60530() {
;
; GISEL-LABEL: pr60530:
; GISEL: // %bb.0:
-; GISEL-NEXT: mov w8, #1 // =0x1
-; GISEL-NEXT: sbfx w0, w8, #0, #1
+; GISEL-NEXT: mov w0, #255 // =0xff
; GISEL-NEXT: ret
%1 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 0, i8 1)
%2 = extractvalue { i8, i1 } %1, 1
diff --git a/llvm/test/CodeGen/AArch64/overflow.ll b/llvm/test/CodeGen/AArch64/overflow.ll
index 1fd60c030979..977141f2b84f 100644
--- a/llvm/test/CodeGen/AArch64/overflow.ll
+++ b/llvm/test/CodeGen/AArch64/overflow.ll
@@ -64,21 +64,10 @@ entry:
}
define i32 @saddo.select.i64(i32 %v1, i32 %v2, i1 %v3, i64 %v4, i64 %v5) {
-; SDAG-LABEL: saddo.select.i64:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w0, w1
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: saddo.select.i64:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: mov w8, #13 // =0xd
-; GISEL-NEXT: and x9, x3, #0xc
-; GISEL-NEXT: and x8, x4, x8
-; GISEL-NEXT: cmn x9, x8
-; GISEL-NEXT: cset w8, vs
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel w0, w0, w1, ne
-; GISEL-NEXT: ret
+; CHECK-LABEL: saddo.select.i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, w1
+; CHECK-NEXT: ret
entry:
%lhs = and i64 %v4, 12
%rhs = and i64 %v5, 13
@@ -89,22 +78,10 @@ entry:
}
define i32 @uaddo.select.i64(i32 %v1, i32 %v2, i1 %v3, i64 %v4, i64 %v5) {
-; SDAG-LABEL: uaddo.select.i64:
-; SDAG: // %bb.0: // %entry
-; SDAG-NEXT: mov w0, w1
-; SDAG-NEXT: ret
-;
-; GISEL-LABEL: uaddo.select.i64:
-; GISEL: // %bb.0: // %entry
-; GISEL-NEXT: mov w8, #9 // =0x9
-; GISEL-NEXT: mov w9, #10 // =0xa
-; GISEL-NEXT: and x8, x3, x8
-; GISEL-NEXT: and x9, x4, x9
-; GISEL-NEXT: cmn x8, x9
-; GISEL-NEXT: cset w8, hs
-; GISEL-NEXT: tst w8, #0x1
-; GISEL-NEXT: csel w0, w0, w1, ne
-; GISEL-NEXT: ret
+; CHECK-LABEL: uaddo.select.i64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, w1
+; CHECK-NEXT: ret
entry:
%lhs = and i64 %v4, 9
%rhs = and i64 %v5, 10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
index 5effd24a7520..fad833c0a6ad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-non-fixed.ll
@@ -50,10 +50,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
- ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
- ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
- ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 392b0ae6823e..75670604baa1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -942,10 +942,10 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_i32_imm_inreg(i32 inreg
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42
; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_i32_inreg
- ; CHECK-NEXT: $sgpr4 = COPY [[C]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[C]](s32)
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY1]](<4 x s32>)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
call amdgpu_gfx void @external_gfx_void_func_i32_inreg(i32 inreg 42)
@@ -3984,11 +3984,11 @@ define amdgpu_gfx void @test_gfx_call_external_void_func_struct_i8_i32_inreg() #
; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_gfx_void_func_struct_i8_i32_inreg
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s16) = G_ANYEXT [[LOAD1]](s8)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[ANYEXT]](s16)
- ; CHECK-NEXT: $sgpr4 = COPY [[ANYEXT1]](s32)
- ; CHECK-NEXT: $sgpr5 = COPY [[LOAD2]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[ANYEXT1]](s32)
+ ; CHECK-NEXT: $sgpr1 = COPY [[LOAD2]](s32)
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]](<4 x s32>)
- ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr4, implicit $sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_gfx_void_func_struct_i8_i32_inreg, csr_amdgpu_si_gfx, implicit $sgpr0, implicit $sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3
; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc
; CHECK-NEXT: SI_RETURN
%ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 66034af5c351..cff9ce050667 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
new file mode 100644
index 000000000000..33b1cc65dc56
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call i32 asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_non_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "v"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+ ret void
+}
+
+define void @func_uses_asm_virtreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ ret void
+}
+
+define void @func_uses_asm_physreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+ ret void
+}
+
+define void @func_uses_asm_physreg_agpr_tuple() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+ ret void
+}
+
+declare void @unknown()
+
+define amdgpu_kernel void @kernel_calls_extern() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: ret void
+;
+ call void @unknown()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @unknown() #0
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: call void [[INDIRECT]]()
+; CHECK-NEXT: ret void
+;
+ call void %indirect()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT: ret void
+;
+ call void %indirect() #0
+ ret void
+}
+
+define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT: ret void
+;
+ call void @func_uses_asm_physreg_agpr()
+ ret void
+}
+
+define void @empty() {
+; CHECK-LABEL: define void @empty(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @also_empty() {
+; CHECK-LABEL: define void @also_empty(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_empty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @empty()
+; CHECK-NEXT: ret void
+;
+ call void @empty()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @empty()
+; CHECK-NEXT: call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT: ret void
+;
+ call void @empty()
+ call void @func_uses_asm_physreg_agpr()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false)
+ ret void
+}
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128
+; CHECK-NEXT: ret void
+;
+ %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+ store <32 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %result = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
+; CHECK-NEXT: call void [[FPTR]]()
+; CHECK-NEXT: ret void
+;
+ %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+ call void %fptr()
+ ret void
+}
+
+
+attributes #0 = { "amdgpu-no-agpr" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index af0eb23d8e99..3d4ae84d9c69 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1025,33 +1025,33 @@ attributes #6 = { "enqueued-block" }
; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 9a9c28ac632f..43cdf85ed381 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -643,19 +643,19 @@ attributes #1 = { nounwind }
; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 6c5e58c74033..547ff69592ca 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -393,17 +393,18 @@ define amdgpu_kernel void @use_get_local_size_z(ptr addrspace(1) %ptr) #1 {
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+;.
; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 98658834e897..e369f7e3b9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -3337,7 +3337,7 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
; GFX11-LABEL: test_inreg_arg_store:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %in, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
index 10d71a315fbf..e1e3220cc275 100644
--- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll
@@ -472,7 +472,7 @@ define amdgpu_gfx void @test34(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test34:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_min_i32 s0, s4, s5
+; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_i32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -492,7 +492,7 @@ define amdgpu_gfx void @test35(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test35:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_max_i32 s0, s4, s5
+; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e8
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -512,9 +512,9 @@ define amdgpu_gfx void @test36(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test36:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_min_u32 s0, s4, s5
+; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_cmp_lt_u32 s0, s6
+; GCN-NEXT: s_cmp_lt_u32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -532,9 +532,9 @@ define amdgpu_gfx void @test37(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test37:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_max_i32 s0, s4, s5
+; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_cmp_ge_i32 s0, s6
+; GCN-NEXT: s_cmp_ge_i32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -552,7 +552,7 @@ define amdgpu_gfx void @test38(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test38:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_max_u32 s0, s4, s5
+; GCN-NEXT: s_max_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_lt_u32 s0, 0x3e9
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -572,7 +572,7 @@ define amdgpu_gfx void @test39(i32 inreg %arg1, i32 inreg %arg2) {
; GCN-LABEL: test39:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_min_i32 s0, s4, s5
+; GCN-NEXT: s_min_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_cmpk_gt_i32 s0, 0x3e7
; GCN-NEXT: v_mov_b32_e32 v1, 0
@@ -592,9 +592,9 @@ define amdgpu_gfx void @test40(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test40:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_max_i32 s0, s4, s5
+; GCN-NEXT: s_max_i32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_cmp_le_i32 s0, s6
+; GCN-NEXT: s_cmp_le_i32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -612,9 +612,9 @@ define amdgpu_gfx void @test41(i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3
; GCN-LABEL: test41:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_min_u32 s0, s4, s5
+; GCN-NEXT: s_min_u32 s0, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_cmp_ge_u32 s0, s6
+; GCN-NEXT: s_cmp_ge_u32 s0, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_cselect_b32 s0, -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
index 895185cb41a3..577d38e65668 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
@@ -333,7 +333,7 @@
ret void
}
- attributes #0 = { "amdgpu-waves-per-eu"="4,4" }
+ attributes #0 = { "amdgpu-waves-per-eu"="4,4" "amdgpu-no-agpr" }
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 0c034192869b..386f9cd3f9ce 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 0069370cc972..05558c555c58 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
;.
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index b2311a87059c..a69418d43641 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -238,7 +238,7 @@ define i128 @fptosi_f64_to_i128(double %x) {
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB0_4
; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
@@ -612,7 +612,7 @@ define i128 @fptoui_f64_to_i128(double %x) {
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB1_4
; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xfffffbcd, v6
+; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
@@ -978,7 +978,7 @@ define i128 @fptosi_f32_to_i128(float %x) {
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB2_4
; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
@@ -1338,7 +1338,7 @@ define i128 @fptoui_f32_to_i128(float %x) {
; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
; GISEL-NEXT: s_cbranch_execz .LBB3_4
; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
-; GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 0xffffff6a, v6
+; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
index 44a9127b4bd0..27845b6b5b2f 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
@@ -2176,6 +2176,93 @@ define void @void_func_a5i32_inreg([5 x i32] inreg %arg0, ptr addrspace(1) %ptr)
declare void @extern()
define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: void_func_a13i32_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s27, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[28:29], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[28:29]
+; GFX9-NEXT: v_mov_b32_e32 v2, s26
+; GFX9-NEXT: global_store_dword v[0:1], v2, off offset:48
+; GFX9-NEXT: v_mov_b32_e32 v5, s25
+; GFX9-NEXT: v_mov_b32_e32 v4, s24
+; GFX9-NEXT: v_mov_b32_e32 v3, s23
+; GFX9-NEXT: v_mov_b32_e32 v2, s22
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX9-NEXT: v_writelane_b32 v40, s27, 2
+; GFX9-NEXT: v_mov_b32_e32 v5, s21
+; GFX9-NEXT: v_mov_b32_e32 v4, s20
+; GFX9-NEXT: v_mov_b32_e32 v3, s19
+; GFX9-NEXT: v_mov_b32_e32 v2, s18
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s17
+; GFX9-NEXT: v_mov_b32_e32 v4, s16
+; GFX9-NEXT: s_getpc_b64 s[16:17]
+; GFX9-NEXT: s_add_u32 s16, s16, extern@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s17, s17, extern@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a13i32_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s23, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s24, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s24
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_dual_mov_b32 v4, s20 :: v_dual_mov_b32 v3, s19
+; GFX11-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v9, s17
+; GFX11-NEXT: s_getpc_b64 s[18:19]
+; GFX11-NEXT: s_add_u32 s18, s18, extern@gotpcrel32@lo+4
+; GFX11-NEXT: s_addc_u32 s19, s19, extern@gotpcrel32@hi+12
+; GFX11-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v7, s7
+; GFX11-NEXT: s_load_b64 s[16:17], s[18:19], 0x0
+; GFX11-NEXT: v_writelane_b32 v40, s23, 2
+; GFX11-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v5, s21
+; GFX11-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v13, s3
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
+; GFX11-NEXT: v_mov_b32_e32 v10, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b32 v[0:1], v14, off offset:48
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_add_i32 s32, s32, -16
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store [13 x i32] %arg0, ptr addrspace(1) %ptr
call void @extern()
ret void
@@ -2203,6 +2290,52 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p
; FIXME: Should still fail
define void @void_func_a16i32_inreg__noimplicit([16 x i32] inreg %arg0, ptr addrspace(1) %ptr) {
+; GFX9-LABEL: void_func_a16i32_inreg__noimplicit:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v5, s19
+; GFX9-NEXT: v_mov_b32_e32 v4, s18
+; GFX9-NEXT: v_mov_b32_e32 v3, s17
+; GFX9-NEXT: v_mov_b32_e32 v2, s16
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:48
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-NEXT: v_mov_b32_e32 v4, s14
+; GFX9-NEXT: v_mov_b32_e32 v3, s13
+; GFX9-NEXT: v_mov_b32_e32 v2, s12
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-NEXT: v_mov_b32_e32 v4, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX9-NEXT: s_nop 0
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_a16i32_inreg__noimplicit:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v4, s14
+; GFX11-NEXT: v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v2, s12
+; GFX11-NEXT: v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v8, s10
+; GFX11-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v6, s8
+; GFX11-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX11-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX11-NEXT: v_dual_mov_b32 v17, s3 :: v_dual_mov_b32 v16, s2
+; GFX11-NEXT: v_dual_mov_b32 v15, s1 :: v_dual_mov_b32 v14, s0
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off offset:48
+; GFX11-NEXT: global_store_b128 v[0:1], v[6:9], off offset:32
+; GFX11-NEXT: global_store_b128 v[0:1], v[10:13], off offset:16
+; GFX11-NEXT: global_store_b128 v[0:1], v[14:17], off
+; GFX11-NEXT: s_setpc_b64 s[30:31]
store [16 x i32] %arg0, ptr addrspace(1) %ptr
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index a118fa388f86..3e1db5fb4e1d 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -9567,19 +9567,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo
-; GFX9-NEXT: s_movk_i32 s4, 0x7b
+; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -9597,19 +9595,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_inreg@abs32@lo
+; GFX10-NEXT: s_movk_i32 s0, 0x7b
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -9627,20 +9623,18 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo
+; GFX11-NEXT: s_movk_i32 s0, 0x7b
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -9658,19 +9652,17 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i8_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i8_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -9692,19 +9684,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo
-; GFX9-NEXT: s_movk_i32 s4, 0x7b
+; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -9722,19 +9712,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i16_inreg@abs32@lo
+; GFX10-NEXT: s_movk_i32 s0, 0x7b
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -9752,20 +9740,18 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo
+; GFX11-NEXT: s_movk_i32 s0, 0x7b
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -9783,19 +9769,17 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -9817,19 +9801,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 42
+; GFX9-NEXT: s_mov_b32 s0, 42
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -9847,19 +9829,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 42
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 42
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -9877,20 +9857,18 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 42
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 42
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -9908,19 +9886,17 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 42
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -9942,22 +9918,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo
-; GFX9-NEXT: s_movk_i32 s4, 0x7b
-; GFX9-NEXT: s_mov_b32 s5, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x7b
+; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -9975,22 +9947,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_i64_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_i64_inreg@abs32@lo
+; GFX10-NEXT: s_movk_i32 s0, 0x7b
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10008,23 +9976,19 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo
+; GFX11-NEXT: s_movk_i32 s0, 0x7b
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x7b
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10042,22 +10006,18 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_i64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_i64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x7b
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10079,26 +10039,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10116,26 +10073,23 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10153,27 +10107,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10191,26 +10137,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10233,28 +10171,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1
-; GFX9-NEXT: s_mov_b32 s5, 2
-; GFX9-NEXT: s_mov_b32 s6, 3
-; GFX9-NEXT: s_mov_b32 s7, 4
+; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s1, 2
+; GFX9-NEXT: s_mov_b32 s2, 3
+; GFX9-NEXT: s_mov_b32 s3, 4
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10272,28 +10202,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 1
+; GFX10-NEXT: s_mov_b32 s1, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s2, 3
+; GFX10-NEXT: s_mov_b32 s3, 4
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10311,29 +10233,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1
+; GFX11-NEXT: s_mov_b32 s1, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 3
+; GFX11-NEXT: s_mov_b32 s3, 4
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10351,28 +10265,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2i64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2i64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10394,32 +10300,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 8
+; GFX9-NEXT: v_writelane_b32 v40, s34, 4
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s9, 5
-; GFX9-NEXT: v_writelane_b32 v40, s30, 6
+; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s8, 1
-; GFX9-NEXT: s_mov_b32 s9, 2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, 1
+; GFX9-NEXT: s_mov_b32 s5, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 7
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
-; GFX9-NEXT: v_readlane_b32 s30, v40, 6
-; GFX9-NEXT: v_readlane_b32 s9, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
+; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 8
+; GFX9-NEXT: v_readlane_b32 s34, v40, 4
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10437,32 +10340,29 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 8
+; GFX10-NEXT: v_writelane_b32 v40, s34, 4
; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 1
-; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 2
-; GFX10-NEXT: v_writelane_b32 v40, s30, 6
-; GFX10-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
-; GFX10-NEXT: v_readlane_b32 s30, v40, 6
-; GFX10-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
+; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 8
+; GFX10-NEXT: v_readlane_b32 s34, v40, 4
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10480,33 +10380,25 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 8
+; GFX11-NEXT: v_writelane_b32 v40, s0, 4
; GFX11-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 1
-; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 2
-; GFX11-NEXT: v_writelane_b32 v40, s30, 6
-; GFX11-NEXT: v_writelane_b32 v40, s31, 7
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
-; GFX11-NEXT: v_readlane_b32 s30, v40, 6
-; GFX11-NEXT: v_readlane_b32 s9, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
+; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 8
+; GFX11-NEXT: v_readlane_b32 s0, v40, 4
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10524,32 +10416,24 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i64_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10574,38 +10458,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_mov_b64 s[34:35], 0
+; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s9, 5
-; GFX9-NEXT: v_writelane_b32 v40, s10, 6
-; GFX9-NEXT: v_writelane_b32 v40, s11, 7
-; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s8, 1
-; GFX9-NEXT: s_mov_b32 s9, 2
-; GFX9-NEXT: s_mov_b32 s10, 3
-; GFX9-NEXT: s_mov_b32 s11, 4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, 1
+; GFX9-NEXT: s_mov_b32 s5, 2
+; GFX9-NEXT: s_mov_b32 s6, 3
+; GFX9-NEXT: s_mov_b32 s7, 4
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
-; GFX9-NEXT: v_readlane_b32 s30, v40, 8
-; GFX9-NEXT: v_readlane_b32 s11, v40, 7
-; GFX9-NEXT: v_readlane_b32 s10, v40, 6
-; GFX9-NEXT: v_readlane_b32 s9, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
+; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 10
+; GFX9-NEXT: v_readlane_b32 s34, v40, 6
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10623,38 +10504,35 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 10
+; GFX10-NEXT: v_writelane_b32 v40, s34, 6
; GFX10-NEXT: s_mov_b64 s[34:35], 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s4, 1
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, 2
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
+; GFX10-NEXT: s_mov_b32 s6, 3
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 1
-; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 2
-; GFX10-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-NEXT: s_mov_b32 s10, 3
-; GFX10-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-NEXT: s_mov_b32 s11, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: s_mov_b32 s7, 4
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 10
+; GFX10-NEXT: v_readlane_b32 s34, v40, 6
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10672,39 +10550,31 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 10
+; GFX11-NEXT: v_writelane_b32 v40, s0, 6
; GFX11-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
+; GFX11-NEXT: s_mov_b32 s5, 2
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
+; GFX11-NEXT: s_mov_b32 s6, 3
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 1
-; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 2
-; GFX11-NEXT: v_writelane_b32 v40, s10, 6
-; GFX11-NEXT: s_mov_b32 s10, 3
-; GFX11-NEXT: v_writelane_b32 v40, s11, 7
-; GFX11-NEXT: s_mov_b32 s11, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 8
-; GFX11-NEXT: v_writelane_b32 v40, s31, 9
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s7, 4
+; GFX11-NEXT: v_writelane_b32 v40, s30, 4
+; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
-; GFX11-NEXT: v_readlane_b32 s30, v40, 8
-; GFX11-NEXT: v_readlane_b32 s11, v40, 7
-; GFX11-NEXT: v_readlane_b32 s10, v40, 6
-; GFX11-NEXT: v_readlane_b32 s9, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
+; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 10
+; GFX11-NEXT: v_readlane_b32 s0, v40, 6
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10722,38 +10592,30 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i64_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i64_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10777,19 +10639,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo
-; GFX9-NEXT: s_movk_i32 s4, 0x4400
+; GFX9-NEXT: s_movk_i32 s0, 0x4400
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10807,19 +10667,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_f16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_f16_inreg@abs32@lo
+; GFX10-NEXT: s_movk_i32 s0, 0x4400
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10837,20 +10695,18 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo
+; GFX11-NEXT: s_movk_i32 s0, 0x4400
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_movk_i32 s4, 0x4400
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10868,19 +10724,17 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_movk_i32 s0, 0x4400
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -10902,19 +10756,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 4.0
+; GFX9-NEXT: s_mov_b32 s0, 4.0
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -10932,19 +10784,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_f32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_f32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 4.0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -10962,20 +10812,18 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 4.0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 4.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -10993,19 +10841,17 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 4.0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11027,22 +10873,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1.0
-; GFX9-NEXT: s_mov_b32 s5, 2.0
+; GFX9-NEXT: s_mov_b32 s0, 1.0
+; GFX9-NEXT: s_mov_b32 s1, 2.0
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11060,22 +10902,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 1.0
+; GFX10-NEXT: s_mov_b32 s1, 2.0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11093,23 +10931,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1.0
+; GFX11-NEXT: s_mov_b32 s1, 2.0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11127,22 +10961,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11164,25 +10994,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 5
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 3
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1.0
-; GFX9-NEXT: s_mov_b32 s5, 2.0
-; GFX9-NEXT: s_mov_b32 s6, 4.0
+; GFX9-NEXT: s_mov_b32 s0, 1.0
+; GFX9-NEXT: s_mov_b32 s1, 2.0
+; GFX9-NEXT: s_mov_b32 s2, 4.0
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
-; GFX9-NEXT: v_readlane_b32 s30, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11200,25 +11024,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 5
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 1.0
+; GFX10-NEXT: s_mov_b32 s1, 2.0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s2, 4.0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s30, 3
-; GFX10-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
-; GFX10-NEXT: v_readlane_b32 s30, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11236,26 +11054,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 5
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1.0
+; GFX11-NEXT: s_mov_b32 s1, 2.0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 4.0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 4.0
-; GFX11-NEXT: v_writelane_b32 v40, s30, 3
-; GFX11-NEXT: v_writelane_b32 v40, s31, 4
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
-; GFX11-NEXT: v_readlane_b32 s30, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11273,25 +11085,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11313,31 +11119,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 7
+; GFX9-NEXT: v_writelane_b32 v40, s34, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s30, 5
+; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1.0
-; GFX9-NEXT: s_mov_b32 s5, 2.0
-; GFX9-NEXT: s_mov_b32 s6, 4.0
-; GFX9-NEXT: s_mov_b32 s7, -1.0
-; GFX9-NEXT: s_mov_b32 s8, 0.5
+; GFX9-NEXT: s_mov_b32 s0, 1.0
+; GFX9-NEXT: s_mov_b32 s1, 2.0
+; GFX9-NEXT: s_mov_b32 s2, 4.0
+; GFX9-NEXT: s_mov_b32 s3, -1.0
+; GFX9-NEXT: s_mov_b32 s4, 0.5
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 6
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
-; GFX9-NEXT: v_readlane_b32 s30, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 7
+; GFX9-NEXT: v_readlane_b32 s34, v40, 3
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11355,31 +11153,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 7
+; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s0, 1.0
+; GFX10-NEXT: s_mov_b32 s1, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1.0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 4.0
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, -1.0
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 0.5
-; GFX10-NEXT: v_writelane_b32 v40, s30, 5
-; GFX10-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-NEXT: s_mov_b32 s2, 4.0
+; GFX10-NEXT: s_mov_b32 s3, -1.0
+; GFX10-NEXT: s_mov_b32 s4, 0.5
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
-; GFX10-NEXT: v_readlane_b32 s30, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 7
+; GFX10-NEXT: v_readlane_b32 s34, v40, 3
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11397,32 +11187,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 7
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1.0
+; GFX11-NEXT: s_mov_b32 s1, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1.0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 4.0
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, -1.0
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 0.5
-; GFX11-NEXT: v_writelane_b32 v40, s30, 5
-; GFX11-NEXT: v_writelane_b32 v40, s31, 6
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s2, 4.0
+; GFX11-NEXT: s_mov_b32 s3, -1.0
+; GFX11-NEXT: s_mov_b32 s4, 0.5
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
-; GFX11-NEXT: v_readlane_b32 s30, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 7
+; GFX11-NEXT: v_readlane_b32 s0, v40, 3
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11440,31 +11222,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5f32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5f32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5f32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5f32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 4.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, -1.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0.5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 4.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, -1.0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0.5
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11486,22 +11260,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_mov_b32 s5, 0x40100000
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, 0x40100000
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11519,22 +11289,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_f64_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_f64_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s1, 0x40100000
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11552,23 +11318,19 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, 0x40100000
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11586,22 +11348,18 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_f64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_f64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_f64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_f64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40100000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11623,28 +11381,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_mov_b32 s5, 2.0
-; GFX9-NEXT: s_mov_b32 s6, 0
-; GFX9-NEXT: s_mov_b32 s7, 0x40100000
+; GFX9-NEXT: s_mov_b32 s2, 0
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, 2.0
+; GFX9-NEXT: s_mov_b32 s3, 0x40100000
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11662,28 +11412,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s1, 2.0
+; GFX10-NEXT: s_mov_b32 s3, 0x40100000
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11701,29 +11443,21 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, 2.0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: s_mov_b32 s3, 0x40100000
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11741,28 +11475,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v2f64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v2f64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11784,34 +11510,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 8
+; GFX9-NEXT: v_writelane_b32 v40, s34, 4
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s9, 5
-; GFX9-NEXT: v_writelane_b32 v40, s30, 6
+; GFX9-NEXT: v_writelane_b32 v40, s30, 2
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
+; GFX9-NEXT: s_mov_b32 s0, 0
+; GFX9-NEXT: s_mov_b32 s1, 2.0
+; GFX9-NEXT: s_mov_b32 s2, 0
+; GFX9-NEXT: s_mov_b32 s3, 0x40100000
; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_mov_b32 s5, 2.0
-; GFX9-NEXT: s_mov_b32 s6, 0
-; GFX9-NEXT: s_mov_b32 s7, 0x40100000
-; GFX9-NEXT: s_mov_b32 s8, 0
-; GFX9-NEXT: s_mov_b32 s9, 0x40200000
+; GFX9-NEXT: s_mov_b32 s5, 0x40200000
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 7
+; GFX9-NEXT: v_writelane_b32 v40, s31, 3
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 7
-; GFX9-NEXT: v_readlane_b32 s30, v40, 6
-; GFX9-NEXT: v_readlane_b32 s9, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 3
+; GFX9-NEXT: v_readlane_b32 s30, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 8
+; GFX9-NEXT: v_readlane_b32 s34, v40, 4
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11829,34 +11547,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 8
+; GFX10-NEXT: v_writelane_b32 v40, s34, 4
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_mov_b32 s1, 2.0
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b32 s3, 0x40100000
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2.0
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 0
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 0x40100000
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 0x40200000
-; GFX10-NEXT: v_writelane_b32 v40, s30, 6
-; GFX10-NEXT: v_writelane_b32 v40, s31, 7
+; GFX10-NEXT: s_mov_b32 s5, 0x40200000
+; GFX10-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-NEXT: v_writelane_b32 v40, s31, 3
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 7
-; GFX10-NEXT: v_readlane_b32 s30, v40, 6
-; GFX10-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 3
+; GFX10-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 8
+; GFX10-NEXT: v_readlane_b32 s34, v40, 4
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -11874,35 +11584,27 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 8
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_mov_b32 s1, 2.0
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: s_mov_b32 s3, 0x40100000
; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2.0
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 0
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 0x40100000
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 0
-; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 0x40200000
-; GFX11-NEXT: v_writelane_b32 v40, s30, 6
-; GFX11-NEXT: v_writelane_b32 v40, s31, 7
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s5, 0x40200000
+; GFX11-NEXT: v_writelane_b32 v40, s30, 2
+; GFX11-NEXT: v_writelane_b32 v40, s31, 3
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 7
-; GFX11-NEXT: v_readlane_b32 s30, v40, 6
-; GFX11-NEXT: v_readlane_b32 s9, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
+; GFX11-NEXT: v_readlane_b32 s31, v40, 3
+; GFX11-NEXT: v_readlane_b32 s30, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 8
+; GFX11-NEXT: v_readlane_b32 s0, v40, 4
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -11920,34 +11622,26 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f64_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f64_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3f64_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3f64_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2.0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 0x40100000
; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 0x40100000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 0x40200000
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 7
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40200000
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -11969,19 +11663,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -11999,19 +11691,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i16_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12029,20 +11719,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12060,19 +11748,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i16_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12095,21 +11781,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi
+; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s1, s35
+; GFX9-NEXT: s_mov_b32 s0, s34
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12127,21 +11812,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3i16_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3i16_inreg@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s1, s35
+; GFX10-NEXT: s_mov_b32 s0, s34
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12159,22 +11843,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12192,21 +11872,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12229,21 +11905,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi
+; GFX9-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s1, s35
+; GFX9-NEXT: s_mov_b32 s0, s34
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12261,21 +11936,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s37, external_void_func_v3f16_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s36, external_void_func_v3f16_inreg@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s1, s35
+; GFX10-NEXT: s_mov_b32 s0, s34
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12293,22 +11967,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12326,21 +11996,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12363,22 +12029,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 0x20001
-; GFX9-NEXT: s_mov_b32 s5, 3
+; GFX9-NEXT: s_mov_b32 s0, 0x20001
+; GFX9-NEXT: s_mov_b32 s1, 3
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12396,22 +12058,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i16_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 0x20001
+; GFX10-NEXT: s_mov_b32 s1, 3
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 3
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12429,23 +12087,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0x20001
+; GFX11-NEXT: s_mov_b32 s1, 3
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 3
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12463,22 +12117,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12500,22 +12150,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX9-NEXT: s_movk_i32 s5, 0x4400
+; GFX9-NEXT: s_mov_b32 s0, 0x40003c00
+; GFX9-NEXT: s_movk_i32 s1, 0x4400
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12533,22 +12179,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3f16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3f16_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 0x40003c00
+; GFX10-NEXT: s_movk_i32 s1, 0x4400
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_movk_i32 s5, 0x4400
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12566,23 +12208,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0x40003c00
+; GFX11-NEXT: s_movk_i32 s1, 0x4400
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_movk_i32 s5, 0x4400
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12600,22 +12238,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3f16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3f16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3f16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3f16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x40003c00
+; GFX10-SCRATCH-NEXT: s_movk_i32 s1, 0x4400
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12637,21 +12271,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi
+; GFX9-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s34
+; GFX9-NEXT: s_mov_b32 s1, s35
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12669,21 +12302,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s37, external_void_func_v4i16_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s36, external_void_func_v4i16_inreg@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s34
+; GFX10-NEXT: s_mov_b32 s1, s35
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12701,22 +12333,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12734,21 +12362,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12771,22 +12395,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 0x20001
-; GFX9-NEXT: s_mov_b32 s5, 0x40003
+; GFX9-NEXT: s_mov_b32 s0, 0x20001
+; GFX9-NEXT: s_mov_b32 s1, 0x40003
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12804,22 +12424,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i16_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 0x20001
+; GFX10-NEXT: s_mov_b32 s1, 0x40003
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 0x40003
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12837,23 +12453,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 0x20001
+; GFX11-NEXT: s_mov_b32 s1, 0x40003
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 0x20001
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 0x40003
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12871,22 +12483,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4i16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4i16_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 0x20001
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 0x40003
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -12908,19 +12516,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 3
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 1
+; GFX9-NEXT: s_load_dword s0, s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 2
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 2
-; GFX9-NEXT: v_readlane_b32 s30, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 3
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -12938,19 +12544,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 3
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0
+; GFX10-NEXT: s_load_dword s0, s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2f16_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2f16_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 3
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -12968,20 +12572,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 1
-; GFX11-NEXT: v_writelane_b32 v40, s31, 2
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 2
-; GFX11-NEXT: v_readlane_b32 s30, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 3
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -12999,19 +12601,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dword s0, s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2f16_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2f16_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2f16_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2f16_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13034,21 +12634,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi
+; GFX9-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s34
+; GFX9-NEXT: s_mov_b32 s1, s35
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13066,21 +12665,20 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s37, external_void_func_v2i32_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s36, external_void_func_v2i32_inreg@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s34
+; GFX10-NEXT: s_mov_b32 s1, s35
+; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37]
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13098,22 +12696,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13131,21 +12725,17 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13168,22 +12758,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 4
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s30, 2
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1
-; GFX9-NEXT: s_mov_b32 s5, 2
+; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s1, 2
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 3
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 3
-; GFX9-NEXT: v_readlane_b32 s30, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13201,22 +12787,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 4
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 1
+; GFX10-NEXT: s_mov_b32 s1, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
-; GFX10-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-NEXT: v_writelane_b32 v40, s31, 3
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13234,23 +12816,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 4
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1
+; GFX11-NEXT: s_mov_b32 s1, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
-; GFX11-NEXT: v_writelane_b32 v40, s30, 2
-; GFX11-NEXT: v_writelane_b32 v40, s31, 3
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 3
-; GFX11-NEXT: v_readlane_b32 s30, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13268,22 +12846,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 3
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13305,25 +12879,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 5
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 3
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 3
-; GFX9-NEXT: s_mov_b32 s5, 4
-; GFX9-NEXT: s_mov_b32 s6, 5
+; GFX9-NEXT: s_mov_b32 s0, 3
+; GFX9-NEXT: s_mov_b32 s1, 4
+; GFX9-NEXT: s_mov_b32 s2, 5
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 4
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 4
-; GFX9-NEXT: v_readlane_b32 s30, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 5
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13341,25 +12909,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 5
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 3
+; GFX10-NEXT: s_mov_b32 s1, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s2, 5
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 5
-; GFX10-NEXT: v_writelane_b32 v40, s30, 3
-; GFX10-NEXT: v_writelane_b32 v40, s31, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 4
-; GFX10-NEXT: v_readlane_b32 s30, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 5
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13377,26 +12939,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 5
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 3
+; GFX11-NEXT: s_mov_b32 s1, 4
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 5
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 5
-; GFX11-NEXT: v_writelane_b32 v40, s30, 3
-; GFX11-NEXT: v_writelane_b32 v40, s31, 4
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 4
-; GFX11-NEXT: v_readlane_b32 s30, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 5
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13414,25 +12970,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 4
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13454,28 +13004,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 3
-; GFX9-NEXT: s_mov_b32 s5, 4
-; GFX9-NEXT: s_mov_b32 s6, 5
-; GFX9-NEXT: s_mov_b32 s7, 6
+; GFX9-NEXT: s_mov_b32 s0, 3
+; GFX9-NEXT: s_mov_b32 s1, 4
+; GFX9-NEXT: s_mov_b32 s2, 5
+; GFX9-NEXT: s_mov_b32 s3, 6
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13493,28 +13035,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 3
+; GFX10-NEXT: s_mov_b32 s1, 4
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s2, 5
+; GFX10-NEXT: s_mov_b32 s3, 6
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 3
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 4
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 5
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 6
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13532,29 +13066,21 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 3
+; GFX11-NEXT: s_mov_b32 s1, 4
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 5
+; GFX11-NEXT: s_mov_b32 s3, 6
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 3
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 4
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 5
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 6
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13572,28 +13098,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i32_i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i32_i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v3i32_i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v3i32_i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 5
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 6
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13615,25 +13133,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13651,25 +13166,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
-; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[36:39], s[34:35], 0x0
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13687,26 +13199,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13724,25 +13228,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13765,28 +13261,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 6
-; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s30, 4
+; GFX9-NEXT: v_writelane_b32 v40, s34, 2
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1
-; GFX9-NEXT: s_mov_b32 s5, 2
-; GFX9-NEXT: s_mov_b32 s6, 3
-; GFX9-NEXT: s_mov_b32 s7, 4
+; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s1, 2
+; GFX9-NEXT: s_mov_b32 s2, 3
+; GFX9-NEXT: s_mov_b32 s3, 4
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 5
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 5
-; GFX9-NEXT: v_readlane_b32 s30, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
-; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 6
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13804,28 +13292,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 6
+; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, 1
+; GFX10-NEXT: s_mov_b32 s1, 2
+; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s2, 3
+; GFX10-NEXT: s_mov_b32 s3, 4
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
-; GFX10-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 6
+; GFX10-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -13843,29 +13323,21 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 6
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
+; GFX11-NEXT: v_writelane_b32 v40, s0, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1
+; GFX11-NEXT: s_mov_b32 s1, 2
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: s_mov_b32 s2, 3
+; GFX11-NEXT: s_mov_b32 s3, 4
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
-; GFX11-NEXT: v_writelane_b32 v40, s30, 4
-; GFX11-NEXT: v_writelane_b32 v40, s31, 5
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 5
-; GFX11-NEXT: v_readlane_b32 s30, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
-; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 6
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -13883,28 +13355,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v4i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v4i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -13926,31 +13390,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 7
+; GFX9-NEXT: v_writelane_b32 v40, s34, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
-; GFX9-NEXT: v_writelane_b32 v40, s5, 1
-; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s30, 5
+; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1
-; GFX9-NEXT: s_mov_b32 s5, 2
-; GFX9-NEXT: s_mov_b32 s6, 3
-; GFX9-NEXT: s_mov_b32 s7, 4
-; GFX9-NEXT: s_mov_b32 s8, 5
+; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s1, 2
+; GFX9-NEXT: s_mov_b32 s2, 3
+; GFX9-NEXT: s_mov_b32 s3, 4
+; GFX9-NEXT: s_mov_b32 s4, 5
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 6
+; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 6
-; GFX9-NEXT: v_readlane_b32 s30, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
-; GFX9-NEXT: v_readlane_b32 s7, v40, 3
-; GFX9-NEXT: v_readlane_b32 s6, v40, 2
-; GFX9-NEXT: v_readlane_b32 s5, v40, 1
+; GFX9-NEXT: v_readlane_b32 s31, v40, 2
+; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 7
+; GFX9-NEXT: v_readlane_b32 s34, v40, 3
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -13968,31 +13424,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 7
+; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s0, 1
+; GFX10-NEXT: s_mov_b32 s1, 2
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
-; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
-; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
-; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 5
-; GFX10-NEXT: v_writelane_b32 v40, s30, 5
-; GFX10-NEXT: v_writelane_b32 v40, s31, 6
+; GFX10-NEXT: s_mov_b32 s2, 3
+; GFX10-NEXT: s_mov_b32 s3, 4
+; GFX10-NEXT: s_mov_b32 s4, 5
+; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 6
-; GFX10-NEXT: v_readlane_b32 s30, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-NEXT: v_readlane_b32 s5, v40, 1
+; GFX10-NEXT: v_readlane_b32 s31, v40, 2
+; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 7
+; GFX10-NEXT: v_readlane_b32 s34, v40, 3
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -14010,32 +13458,24 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 7
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s0, 3
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1
+; GFX11-NEXT: s_mov_b32 s1, 2
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
-; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
-; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
-; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 5
-; GFX11-NEXT: v_writelane_b32 v40, s30, 5
-; GFX11-NEXT: v_writelane_b32 v40, s31, 6
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s2, 3
+; GFX11-NEXT: s_mov_b32 s3, 4
+; GFX11-NEXT: s_mov_b32 s4, 5
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s30, 1
+; GFX11-NEXT: v_writelane_b32 v40, s31, 2
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 6
-; GFX11-NEXT: v_readlane_b32 s30, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
-; GFX11-NEXT: v_readlane_b32 s7, v40, 3
-; GFX11-NEXT: v_readlane_b32 s6, v40, 2
-; GFX11-NEXT: v_readlane_b32 s5, v40, 1
+; GFX11-NEXT: v_readlane_b32 s31, v40, 2
+; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 7
+; GFX11-NEXT: v_readlane_b32 s0, v40, 3
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -14053,31 +13493,23 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v5i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v5i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 6
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -14099,35 +13531,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s9, 5
-; GFX9-NEXT: v_writelane_b32 v40, s10, 6
-; GFX9-NEXT: v_writelane_b32 v40, s11, 7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0
+; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, s40
+; GFX9-NEXT: s_mov_b32 s5, s41
+; GFX9-NEXT: s_mov_b32 s6, s42
+; GFX9-NEXT: s_mov_b32 s7, s43
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
-; GFX9-NEXT: v_readlane_b32 s30, v40, 8
-; GFX9-NEXT: v_readlane_b32 s11, v40, 7
-; GFX9-NEXT: v_readlane_b32 s10, v40, 6
-; GFX9-NEXT: v_readlane_b32 s9, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
+; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 10
+; GFX9-NEXT: v_readlane_b32 s34, v40, 6
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -14145,35 +13578,36 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 10
+; GFX10-NEXT: v_writelane_b32 v40, s34, 6
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-NEXT: v_writelane_b32 v40, s11, 7
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[34:35], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[36:43], s[34:35], 0x0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
+; GFX10-NEXT: s_mov_b32 s4, s40
+; GFX10-NEXT: s_mov_b32 s5, s41
+; GFX10-NEXT: s_mov_b32 s6, s42
+; GFX10-NEXT: s_mov_b32 s7, s43
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 10
+; GFX10-NEXT: v_readlane_b32 s34, v40, 6
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -14191,36 +13625,28 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 10
+; GFX11-NEXT: v_writelane_b32 v40, s0, 6
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: v_writelane_b32 v40, s10, 6
-; GFX11-NEXT: v_writelane_b32 v40, s11, 7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 8
-; GFX11-NEXT: v_writelane_b32 v40, s31, 9
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 4
+; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
-; GFX11-NEXT: v_readlane_b32 s30, v40, 8
-; GFX11-NEXT: v_readlane_b32 s11, v40, 7
-; GFX11-NEXT: v_readlane_b32 s10, v40, 6
-; GFX11-NEXT: v_readlane_b32 s9, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
+; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 10
+; GFX11-NEXT: v_readlane_b32 s0, v40, 6
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -14238,35 +13664,27 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -14290,40 +13708,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 10
+; GFX9-NEXT: v_writelane_b32 v40, s34, 6
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
-; GFX9-NEXT: v_writelane_b32 v40, s8, 4
-; GFX9-NEXT: v_writelane_b32 v40, s9, 5
-; GFX9-NEXT: v_writelane_b32 v40, s10, 6
-; GFX9-NEXT: v_writelane_b32 v40, s11, 7
-; GFX9-NEXT: v_writelane_b32 v40, s30, 8
+; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s4, 1
-; GFX9-NEXT: s_mov_b32 s5, 2
-; GFX9-NEXT: s_mov_b32 s6, 3
-; GFX9-NEXT: s_mov_b32 s7, 4
-; GFX9-NEXT: s_mov_b32 s8, 5
-; GFX9-NEXT: s_mov_b32 s9, 6
-; GFX9-NEXT: s_mov_b32 s10, 7
-; GFX9-NEXT: s_mov_b32 s11, 8
+; GFX9-NEXT: s_mov_b32 s0, 1
+; GFX9-NEXT: s_mov_b32 s1, 2
+; GFX9-NEXT: s_mov_b32 s2, 3
+; GFX9-NEXT: s_mov_b32 s3, 4
+; GFX9-NEXT: s_mov_b32 s4, 5
+; GFX9-NEXT: s_mov_b32 s5, 6
+; GFX9-NEXT: s_mov_b32 s6, 7
+; GFX9-NEXT: s_mov_b32 s7, 8
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 9
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 9
-; GFX9-NEXT: v_readlane_b32 s30, v40, 8
-; GFX9-NEXT: v_readlane_b32 s11, v40, 7
-; GFX9-NEXT: v_readlane_b32 s10, v40, 6
-; GFX9-NEXT: v_readlane_b32 s9, v40, 5
-; GFX9-NEXT: v_readlane_b32 s8, v40, 4
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
+; GFX9-NEXT: v_readlane_b32 s30, v40, 4
; GFX9-NEXT: v_readlane_b32 s7, v40, 3
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 10
+; GFX9-NEXT: v_readlane_b32 s34, v40, 6
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -14341,40 +13751,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 10
+; GFX10-NEXT: v_writelane_b32 v40, s34, 6
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s0, 1
+; GFX10-NEXT: s_mov_b32 s1, 2
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-NEXT: s_mov_b32 s4, 1
+; GFX10-NEXT: s_mov_b32 s2, 3
+; GFX10-NEXT: s_mov_b32 s3, 4
+; GFX10-NEXT: s_mov_b32 s4, 5
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-NEXT: s_mov_b32 s5, 2
+; GFX10-NEXT: s_mov_b32 s5, 6
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-NEXT: s_mov_b32 s6, 3
+; GFX10-NEXT: s_mov_b32 s6, 7
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-NEXT: s_mov_b32 s7, 4
-; GFX10-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-NEXT: s_mov_b32 s8, 5
-; GFX10-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-NEXT: s_mov_b32 s9, 6
-; GFX10-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-NEXT: s_mov_b32 s10, 7
-; GFX10-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-NEXT: s_mov_b32 s11, 8
-; GFX10-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-NEXT: v_writelane_b32 v40, s31, 9
+; GFX10-NEXT: s_mov_b32 s7, 8
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 10
+; GFX10-NEXT: v_readlane_b32 s34, v40, 6
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -14392,41 +13794,33 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 10
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
-; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: v_writelane_b32 v40, s0, 6
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
+; GFX11-NEXT: s_mov_b32 s0, 1
+; GFX11-NEXT: s_mov_b32 s1, 2
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
-; GFX11-NEXT: s_mov_b32 s4, 1
+; GFX11-NEXT: s_mov_b32 s2, 3
+; GFX11-NEXT: s_mov_b32 s3, 4
+; GFX11-NEXT: s_mov_b32 s4, 5
+; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
-; GFX11-NEXT: s_mov_b32 s5, 2
+; GFX11-NEXT: s_mov_b32 s5, 6
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
-; GFX11-NEXT: s_mov_b32 s6, 3
+; GFX11-NEXT: s_mov_b32 s6, 7
; GFX11-NEXT: v_writelane_b32 v40, s7, 3
-; GFX11-NEXT: s_mov_b32 s7, 4
-; GFX11-NEXT: v_writelane_b32 v40, s8, 4
-; GFX11-NEXT: s_mov_b32 s8, 5
-; GFX11-NEXT: v_writelane_b32 v40, s9, 5
-; GFX11-NEXT: s_mov_b32 s9, 6
-; GFX11-NEXT: v_writelane_b32 v40, s10, 6
-; GFX11-NEXT: s_mov_b32 s10, 7
-; GFX11-NEXT: v_writelane_b32 v40, s11, 7
-; GFX11-NEXT: s_mov_b32 s11, 8
-; GFX11-NEXT: v_writelane_b32 v40, s30, 8
-; GFX11-NEXT: v_writelane_b32 v40, s31, 9
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_mov_b32 s7, 8
+; GFX11-NEXT: v_writelane_b32 v40, s30, 4
+; GFX11-NEXT: v_writelane_b32 v40, s31, 5
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 9
-; GFX11-NEXT: v_readlane_b32 s30, v40, 8
-; GFX11-NEXT: v_readlane_b32 s11, v40, 7
-; GFX11-NEXT: v_readlane_b32 s10, v40, 6
-; GFX11-NEXT: v_readlane_b32 s9, v40, 5
-; GFX11-NEXT: v_readlane_b32 s8, v40, 4
+; GFX11-NEXT: v_readlane_b32 s31, v40, 5
+; GFX11-NEXT: v_readlane_b32 s30, v40, 4
; GFX11-NEXT: v_readlane_b32 s7, v40, 3
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 10
+; GFX11-NEXT: v_readlane_b32 s0, v40, 6
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -14444,40 +13838,32 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 s0, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 5
+; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
-; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 6
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 3
+; GFX10-SCRATCH-NEXT: s_mov_b32 s6, 7
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
-; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 4
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
-; GFX10-SCRATCH-NEXT: s_mov_b32 s8, 5
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
-; GFX10-SCRATCH-NEXT: s_mov_b32 s9, 6
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
-; GFX10-SCRATCH-NEXT: s_mov_b32 s10, 7
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s11, 7
-; GFX10-SCRATCH-NEXT: s_mov_b32 s11, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 8
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 9
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 9
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 8
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s11, v40, 7
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s10, v40, 6
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s9, v40, 5
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s8, v40, 4
+; GFX10-SCRATCH-NEXT: s_mov_b32 s7, 8
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 5
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 4
; GFX10-SCRATCH-NEXT: v_readlane_b32 s7, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -14499,38 +13885,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 18
+; GFX9-NEXT: v_writelane_b32 v40, s34, 14
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s5, 1
; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s7, 3
; GFX9-NEXT: v_writelane_b32 v40, s8, 4
; GFX9-NEXT: v_writelane_b32 v40, s9, 5
; GFX9-NEXT: v_writelane_b32 v40, s10, 6
; GFX9-NEXT: v_writelane_b32 v40, s11, 7
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s12, 8
; GFX9-NEXT: v_writelane_b32 v40, s13, 9
; GFX9-NEXT: v_writelane_b32 v40, s14, 10
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s15, 11
-; GFX9-NEXT: v_writelane_b32 v40, s16, 12
-; GFX9-NEXT: v_writelane_b32 v40, s17, 13
-; GFX9-NEXT: v_writelane_b32 v40, s18, 14
-; GFX9-NEXT: v_writelane_b32 v40, s19, 15
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX9-NEXT: v_writelane_b32 v40, s30, 16
+; GFX9-NEXT: v_writelane_b32 v40, s30, 12
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, s40
+; GFX9-NEXT: s_mov_b32 s5, s41
+; GFX9-NEXT: s_mov_b32 s6, s42
+; GFX9-NEXT: s_mov_b32 s7, s43
+; GFX9-NEXT: s_mov_b32 s8, s44
+; GFX9-NEXT: s_mov_b32 s9, s45
+; GFX9-NEXT: s_mov_b32 s10, s46
+; GFX9-NEXT: s_mov_b32 s11, s47
+; GFX9-NEXT: s_mov_b32 s12, s48
+; GFX9-NEXT: s_mov_b32 s13, s49
+; GFX9-NEXT: s_mov_b32 s14, s50
+; GFX9-NEXT: s_mov_b32 s15, s51
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 17
+; GFX9-NEXT: v_writelane_b32 v40, s31, 13
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 17
-; GFX9-NEXT: v_readlane_b32 s30, v40, 16
-; GFX9-NEXT: v_readlane_b32 s19, v40, 15
-; GFX9-NEXT: v_readlane_b32 s18, v40, 14
-; GFX9-NEXT: v_readlane_b32 s17, v40, 13
-; GFX9-NEXT: v_readlane_b32 s16, v40, 12
+; GFX9-NEXT: v_readlane_b32 s31, v40, 13
+; GFX9-NEXT: v_readlane_b32 s30, v40, 12
; GFX9-NEXT: v_readlane_b32 s15, v40, 11
; GFX9-NEXT: v_readlane_b32 s14, v40, 10
; GFX9-NEXT: v_readlane_b32 s13, v40, 9
@@ -14543,7 +13938,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX9-NEXT: v_readlane_b32 s6, v40, 2
; GFX9-NEXT: v_readlane_b32 s5, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 18
+; GFX9-NEXT: v_readlane_b32 s34, v40, 14
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -14561,38 +13956,47 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 18
+; GFX10-NEXT: v_writelane_b32 v40, s34, 14
; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s8, 4
; GFX10-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-NEXT: v_writelane_b32 v40, s10, 6
; GFX10-NEXT: v_writelane_b32 v40, s11, 7
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: v_writelane_b32 v40, s12, 8
+; GFX10-NEXT: s_mov_b32 s4, s40
+; GFX10-NEXT: s_mov_b32 s5, s41
+; GFX10-NEXT: s_mov_b32 s6, s42
+; GFX10-NEXT: s_mov_b32 s7, s43
; GFX10-NEXT: v_writelane_b32 v40, s13, 9
+; GFX10-NEXT: s_mov_b32 s8, s44
+; GFX10-NEXT: s_mov_b32 s9, s45
+; GFX10-NEXT: s_mov_b32 s10, s46
+; GFX10-NEXT: s_mov_b32 s11, s47
; GFX10-NEXT: v_writelane_b32 v40, s14, 10
+; GFX10-NEXT: s_mov_b32 s12, s48
+; GFX10-NEXT: s_mov_b32 s13, s49
+; GFX10-NEXT: s_mov_b32 s14, s50
; GFX10-NEXT: v_writelane_b32 v40, s15, 11
-; GFX10-NEXT: v_writelane_b32 v40, s16, 12
-; GFX10-NEXT: v_writelane_b32 v40, s17, 13
-; GFX10-NEXT: v_writelane_b32 v40, s18, 14
-; GFX10-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
-; GFX10-NEXT: v_writelane_b32 v40, s30, 16
-; GFX10-NEXT: v_writelane_b32 v40, s31, 17
+; GFX10-NEXT: s_mov_b32 s15, s51
+; GFX10-NEXT: v_writelane_b32 v40, s30, 12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 13
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 17
-; GFX10-NEXT: v_readlane_b32 s30, v40, 16
-; GFX10-NEXT: v_readlane_b32 s19, v40, 15
-; GFX10-NEXT: v_readlane_b32 s18, v40, 14
-; GFX10-NEXT: v_readlane_b32 s17, v40, 13
-; GFX10-NEXT: v_readlane_b32 s16, v40, 12
+; GFX10-NEXT: v_readlane_b32 s31, v40, 13
+; GFX10-NEXT: v_readlane_b32 s30, v40, 12
; GFX10-NEXT: v_readlane_b32 s15, v40, 11
; GFX10-NEXT: v_readlane_b32 s14, v40, 10
; GFX10-NEXT: v_readlane_b32 s13, v40, 9
@@ -14605,7 +14009,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 18
+; GFX10-NEXT: v_readlane_b32 s34, v40, 14
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -14623,8 +14027,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 18
+; GFX11-NEXT: v_writelane_b32 v40, s0, 14
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
@@ -14638,24 +14044,14 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s13, 9
; GFX11-NEXT: v_writelane_b32 v40, s14, 10
; GFX11-NEXT: v_writelane_b32 v40, s15, 11
-; GFX11-NEXT: v_writelane_b32 v40, s16, 12
-; GFX11-NEXT: v_writelane_b32 v40, s17, 13
-; GFX11-NEXT: v_writelane_b32 v40, s18, 14
-; GFX11-NEXT: v_writelane_b32 v40, s19, 15
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo
-; GFX11-NEXT: v_writelane_b32 v40, s30, 16
-; GFX11-NEXT: v_writelane_b32 v40, s31, 17
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: v_writelane_b32 v40, s30, 12
+; GFX11-NEXT: v_writelane_b32 v40, s31, 13
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 17
-; GFX11-NEXT: v_readlane_b32 s30, v40, 16
-; GFX11-NEXT: v_readlane_b32 s19, v40, 15
-; GFX11-NEXT: v_readlane_b32 s18, v40, 14
-; GFX11-NEXT: v_readlane_b32 s17, v40, 13
-; GFX11-NEXT: v_readlane_b32 s16, v40, 12
+; GFX11-NEXT: v_readlane_b32 s31, v40, 13
+; GFX11-NEXT: v_readlane_b32 s30, v40, 12
; GFX11-NEXT: v_readlane_b32 s15, v40, 11
; GFX11-NEXT: v_readlane_b32 s14, v40, 10
; GFX11-NEXT: v_readlane_b32 s13, v40, 9
@@ -14668,7 +14064,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX11-NEXT: v_readlane_b32 s6, v40, 2
; GFX11-NEXT: v_readlane_b32 s5, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
-; GFX11-NEXT: v_readlane_b32 s0, v40, 18
+; GFX11-NEXT: v_readlane_b32 s0, v40, 14
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
@@ -14686,8 +14082,10 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 14
; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
@@ -14701,23 +14099,13 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s13, 9
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s14, 10
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s15, 11
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s16, 12
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16i32_inreg@abs32@lo
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 16
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 17
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 17
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 16
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s19, v40, 15
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s18, v40, 14
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s17, v40, 13
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s16, v40, 12
+; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 12
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 13
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 13
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 12
; GFX10-SCRATCH-NEXT: v_readlane_b32 s15, v40, 11
; GFX10-SCRATCH-NEXT: v_readlane_b32 s14, v40, 10
; GFX10-SCRATCH-NEXT: v_readlane_b32 s13, v40, 9
@@ -14730,7 +14118,7 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
-; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18
+; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 14
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
@@ -14771,49 +14159,47 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s20, 16
; GFX9-NEXT: v_writelane_b32 v40, s21, 17
; GFX9-NEXT: v_writelane_b32 v40, s22, 18
; GFX9-NEXT: v_writelane_b32 v40, s23, 19
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s27, 23
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s28, 24
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s46
; GFX9-NEXT: v_writelane_b32 v40, s29, 25
-; GFX9-NEXT: v_mov_b32_e32 v1, s47
-; GFX9-NEXT: v_mov_b32_e32 v2, s48
-; GFX9-NEXT: v_mov_b32_e32 v3, s49
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX9-NEXT: v_mov_b32_e32 v0, s50
; GFX9-NEXT: v_writelane_b32 v40, s30, 26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
-; GFX9-NEXT: v_mov_b32_e32 v0, s51
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s20, s36
-; GFX9-NEXT: s_mov_b32 s21, s37
-; GFX9-NEXT: s_mov_b32 s22, s38
-; GFX9-NEXT: s_mov_b32 s23, s39
-; GFX9-NEXT: s_mov_b32 s24, s40
-; GFX9-NEXT: s_mov_b32 s25, s41
-; GFX9-NEXT: s_mov_b32 s26, s42
-; GFX9-NEXT: s_mov_b32 s27, s43
-; GFX9-NEXT: s_mov_b32 s28, s44
-; GFX9-NEXT: s_mov_b32 s29, s45
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_inreg@abs32@hi
+; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_inreg@abs32@lo
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s30
+; GFX9-NEXT: v_mov_b32_e32 v1, s31
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, s40
+; GFX9-NEXT: s_mov_b32 s5, s41
+; GFX9-NEXT: s_mov_b32 s6, s42
+; GFX9-NEXT: s_mov_b32 s7, s43
+; GFX9-NEXT: s_mov_b32 s8, s44
+; GFX9-NEXT: s_mov_b32 s9, s45
+; GFX9-NEXT: s_mov_b32 s10, s46
+; GFX9-NEXT: s_mov_b32 s11, s47
+; GFX9-NEXT: s_mov_b32 s12, s48
+; GFX9-NEXT: s_mov_b32 s13, s49
+; GFX9-NEXT: s_mov_b32 s14, s50
+; GFX9-NEXT: s_mov_b32 s15, s51
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53]
; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
@@ -14879,47 +14265,46 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-NEXT: v_writelane_b32 v40, s22, 18
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s46
; GFX10-NEXT: v_writelane_b32 v40, s23, 19
-; GFX10-NEXT: v_mov_b32_e32 v1, s47
-; GFX10-NEXT: v_mov_b32_e32 v2, s48
-; GFX10-NEXT: v_mov_b32_e32 v3, s49
-; GFX10-NEXT: s_mov_b32 s20, s36
; GFX10-NEXT: v_writelane_b32 v40, s24, 20
-; GFX10-NEXT: s_mov_b32 s21, s37
-; GFX10-NEXT: s_mov_b32 s22, s38
-; GFX10-NEXT: s_mov_b32 s23, s39
-; GFX10-NEXT: s_mov_b32 s24, s40
; GFX10-NEXT: v_writelane_b32 v40, s25, 21
-; GFX10-NEXT: s_mov_b32 s25, s41
-; GFX10-NEXT: v_mov_b32_e32 v4, s50
-; GFX10-NEXT: v_mov_b32_e32 v5, s51
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
-; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
-; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
; GFX10-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-NEXT: s_mov_b32 s26, s42
; GFX10-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-NEXT: s_mov_b32 s27, s43
; GFX10-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-NEXT: s_mov_b32 s28, s44
; GFX10-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-NEXT: s_mov_b32 s29, s45
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s30
+; GFX10-NEXT: v_mov_b32_e32 v1, s31
+; GFX10-NEXT: s_mov_b32 s4, s40
+; GFX10-NEXT: s_mov_b32 s5, s41
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX10-NEXT: s_mov_b32 s6, s42
+; GFX10-NEXT: s_mov_b32 s7, s43
+; GFX10-NEXT: s_mov_b32 s8, s44
+; GFX10-NEXT: s_mov_b32 s9, s45
+; GFX10-NEXT: s_mov_b32 s10, s46
+; GFX10-NEXT: s_mov_b32 s11, s47
+; GFX10-NEXT: s_mov_b32 s12, s48
+; GFX10-NEXT: s_mov_b32 s13, s49
+; GFX10-NEXT: s_mov_b32 s14, s50
+; GFX10-NEXT: s_mov_b32 s15, s51
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
@@ -14970,8 +14355,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s0, 28
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s2, s32, 16
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
@@ -14988,42 +14373,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX11-NEXT: v_writelane_b32 v40, s17, 13
; GFX11-NEXT: v_writelane_b32 v40, s18, 14
; GFX11-NEXT: v_writelane_b32 v40, s19, 15
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40
-; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s20, 16
; GFX11-NEXT: v_writelane_b32 v40, s21, 17
; GFX11-NEXT: v_writelane_b32 v40, s22, 18
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v5, s51
; GFX11-NEXT: v_writelane_b32 v40, s23, 19
-; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v1, s47
-; GFX11-NEXT: v_dual_mov_b32 v2, s48 :: v_dual_mov_b32 v3, s49
; GFX11-NEXT: v_writelane_b32 v40, s24, 20
-; GFX11-NEXT: s_mov_b32 s20, s36
-; GFX11-NEXT: s_mov_b32 s21, s37
-; GFX11-NEXT: s_mov_b32 s22, s38
-; GFX11-NEXT: s_mov_b32 s23, s39
; GFX11-NEXT: v_writelane_b32 v40, s25, 21
-; GFX11-NEXT: s_mov_b32 s24, s40
-; GFX11-NEXT: s_mov_b32 s25, s41
-; GFX11-NEXT: scratch_store_b64 off, v[4:5], s2
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
-; GFX11-NEXT: s_mov_b32 s26, s42
; GFX11-NEXT: v_writelane_b32 v40, s27, 23
-; GFX11-NEXT: s_mov_b32 s27, s43
; GFX11-NEXT: v_writelane_b32 v40, s28, 24
-; GFX11-NEXT: s_mov_b32 s28, s44
; GFX11-NEXT: v_writelane_b32 v40, s29, 25
-; GFX11-NEXT: s_mov_b32 s29, s45
; GFX11-NEXT: v_writelane_b32 v40, s30, 26
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v1, s31
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
@@ -15071,9 +14440,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
@@ -15090,44 +14458,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_clause 0x1
-; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40
-; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
-; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49
-; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36
-; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37
-; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
-; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
-; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
-; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s2
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
@@ -15196,55 +14549,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s16, 12
; GFX9-NEXT: v_writelane_b32 v40, s17, 13
; GFX9-NEXT: v_writelane_b32 v40, s18, 14
-; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s19, 15
; GFX9-NEXT: v_writelane_b32 v40, s20, 16
; GFX9-NEXT: v_writelane_b32 v40, s21, 17
; GFX9-NEXT: v_writelane_b32 v40, s22, 18
; GFX9-NEXT: v_writelane_b32 v40, s23, 19
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0
-; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
-; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX9-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s24, 20
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s25, 21
-; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s26, 22
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s52
; GFX9-NEXT: v_writelane_b32 v40, s27, 23
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX9-NEXT: v_mov_b32_e32 v0, s46
; GFX9-NEXT: v_writelane_b32 v40, s28, 24
-; GFX9-NEXT: v_mov_b32_e32 v1, s47
-; GFX9-NEXT: v_mov_b32_e32 v2, s48
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX9-NEXT: v_mov_b32_e32 v0, s49
; GFX9-NEXT: v_writelane_b32 v40, s29, 25
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12
-; GFX9-NEXT: v_mov_b32_e32 v0, s50
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dword s52, s[34:35], 0x0
; GFX9-NEXT: v_writelane_b32 v40, s30, 26
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
-; GFX9-NEXT: v_mov_b32_e32 v0, s51
-; GFX9-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
-; GFX9-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
-; GFX9-NEXT: s_mov_b32 s20, s36
-; GFX9-NEXT: s_mov_b32 s21, s37
-; GFX9-NEXT: s_mov_b32 s22, s38
-; GFX9-NEXT: s_mov_b32 s23, s39
-; GFX9-NEXT: s_mov_b32 s24, s40
-; GFX9-NEXT: s_mov_b32 s25, s41
-; GFX9-NEXT: s_mov_b32 s26, s42
-; GFX9-NEXT: s_mov_b32 s27, s43
-; GFX9-NEXT: s_mov_b32 s28, s44
-; GFX9-NEXT: s_mov_b32 s29, s45
; GFX9-NEXT: v_writelane_b32 v40, s31, 27
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; GFX9-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX9-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s52
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, s30
+; GFX9-NEXT: s_mov_b32 s53, external_void_func_v32i32_i32_inreg@abs32@hi
+; GFX9-NEXT: v_mov_b32_e32 v1, s31
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GFX9-NEXT: s_mov_b32 s52, external_void_func_v32i32_i32_inreg@abs32@lo
+; GFX9-NEXT: s_mov_b32 s0, s36
+; GFX9-NEXT: s_mov_b32 s1, s37
+; GFX9-NEXT: s_mov_b32 s2, s38
+; GFX9-NEXT: s_mov_b32 s3, s39
+; GFX9-NEXT: s_mov_b32 s4, s40
+; GFX9-NEXT: s_mov_b32 s5, s41
+; GFX9-NEXT: s_mov_b32 s6, s42
+; GFX9-NEXT: s_mov_b32 s7, s43
+; GFX9-NEXT: s_mov_b32 s8, s44
+; GFX9-NEXT: s_mov_b32 s9, s45
+; GFX9-NEXT: s_mov_b32 s10, s46
+; GFX9-NEXT: s_mov_b32 s11, s47
+; GFX9-NEXT: s_mov_b32 s12, s48
+; GFX9-NEXT: s_mov_b32 s13, s49
+; GFX9-NEXT: s_mov_b32 s14, s50
+; GFX9-NEXT: s_mov_b32 s15, s51
+; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT: ; kill: killed $sgpr34_sgpr35
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[52:53]
; GFX9-NEXT: v_readlane_b32 s31, v40, 27
; GFX9-NEXT: v_readlane_b32 s30, v40, 26
; GFX9-NEXT: v_readlane_b32 s29, v40, 25
@@ -15310,52 +14661,51 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0
-; GFX10-NEXT: ; meta instruction
-; GFX10-NEXT: ; meta instruction
-; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x40
-; GFX10-NEXT: s_load_dwordx16 s[4:19], s[34:35], 0x0
-; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
-; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX10-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-NEXT: v_writelane_b32 v40, s22, 18
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s52
-; GFX10-NEXT: v_mov_b32_e32 v1, s47
; GFX10-NEXT: v_writelane_b32 v40, s23, 19
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24
-; GFX10-NEXT: v_mov_b32_e32 v0, s46
-; GFX10-NEXT: v_mov_b32_e32 v2, s48
-; GFX10-NEXT: v_mov_b32_e32 v3, s49
; GFX10-NEXT: v_writelane_b32 v40, s24, 20
-; GFX10-NEXT: s_mov_b32 s20, s36
-; GFX10-NEXT: s_mov_b32 s21, s37
-; GFX10-NEXT: s_mov_b32 s22, s38
-; GFX10-NEXT: s_mov_b32 s23, s39
; GFX10-NEXT: v_writelane_b32 v40, s25, 21
-; GFX10-NEXT: s_mov_b32 s24, s40
-; GFX10-NEXT: s_mov_b32 s25, s41
-; GFX10-NEXT: v_mov_b32_e32 v4, s50
-; GFX10-NEXT: v_mov_b32_e32 v5, s51
; GFX10-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-NEXT: s_mov_b32 s26, s42
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32
-; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8
-; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12
-; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
-; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:20
; GFX10-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-NEXT: s_mov_b32 s27, s43
; GFX10-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-NEXT: s_mov_b32 s28, s44
; GFX10-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-NEXT: s_mov_b32 s29, s45
; GFX10-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-NEXT: v_writelane_b32 v40, s31, 27
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_clause 0x2
+; GFX10-NEXT: s_load_dword s52, s[34:35], 0x0
+; GFX10-NEXT: ; meta instruction
+; GFX10-NEXT: ; meta instruction
+; GFX10-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-NEXT: s_load_dwordx16 s[36:51], s[34:35], 0x0
+; GFX10-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
+; GFX10-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s52
+; GFX10-NEXT: v_mov_b32_e32 v1, s30
+; GFX10-NEXT: v_mov_b32_e32 v2, s31
+; GFX10-NEXT: s_mov_b32 s4, s40
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32
+; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4
+; GFX10-NEXT: s_mov_b32 s5, s41
+; GFX10-NEXT: s_mov_b32 s6, s42
+; GFX10-NEXT: s_mov_b32 s7, s43
+; GFX10-NEXT: s_mov_b32 s8, s44
+; GFX10-NEXT: s_mov_b32 s9, s45
+; GFX10-NEXT: s_mov_b32 s10, s46
+; GFX10-NEXT: s_mov_b32 s11, s47
+; GFX10-NEXT: s_mov_b32 s12, s48
+; GFX10-NEXT: s_mov_b32 s13, s49
+; GFX10-NEXT: s_mov_b32 s14, s50
+; GFX10-NEXT: s_mov_b32 s15, s51
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_mov_b32 s0, s36
+; GFX10-NEXT: s_mov_b32 s1, s37
+; GFX10-NEXT: s_mov_b32 s2, s38
+; GFX10-NEXT: s_mov_b32 s3, s39
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-NEXT: v_readlane_b32 s30, v40, 26
@@ -15406,8 +14756,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s0, 28
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s3, s32, 16
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
+; GFX11-NEXT: s_add_i32 s36, s32, 8
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: v_writelane_b32 v40, s5, 1
; GFX11-NEXT: v_writelane_b32 v40, s6, 2
@@ -15424,46 +14774,30 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX11-NEXT: v_writelane_b32 v40, s17, 13
; GFX11-NEXT: v_writelane_b32 v40, s18, 14
; GFX11-NEXT: v_writelane_b32 v40, s19, 15
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0x40
-; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX11-NEXT: v_writelane_b32 v40, s20, 16
; GFX11-NEXT: v_writelane_b32 v40, s21, 17
; GFX11-NEXT: v_writelane_b32 v40, s22, 18
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v5, s51
; GFX11-NEXT: v_writelane_b32 v40, s23, 19
-; GFX11-NEXT: v_dual_mov_b32 v4, s50 :: v_dual_mov_b32 v1, s47
-; GFX11-NEXT: v_dual_mov_b32 v0, s46 :: v_dual_mov_b32 v3, s49
; GFX11-NEXT: v_writelane_b32 v40, s24, 20
-; GFX11-NEXT: v_mov_b32_e32 v2, s48
-; GFX11-NEXT: s_add_i32 s2, s32, 24
-; GFX11-NEXT: s_mov_b32 s20, s36
-; GFX11-NEXT: s_mov_b32 s21, s37
; GFX11-NEXT: v_writelane_b32 v40, s25, 21
-; GFX11-NEXT: s_mov_b32 s22, s38
-; GFX11-NEXT: s_mov_b32 s23, s39
-; GFX11-NEXT: s_mov_b32 s24, s40
-; GFX11-NEXT: s_mov_b32 s25, s41
; GFX11-NEXT: v_writelane_b32 v40, s26, 22
-; GFX11-NEXT: s_mov_b32 s26, s42
-; GFX11-NEXT: scratch_store_b32 off, v6, s2
-; GFX11-NEXT: scratch_store_b64 off, v[4:5], s3
-; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
; GFX11-NEXT: v_writelane_b32 v40, s27, 23
-; GFX11-NEXT: s_mov_b32 s27, s43
; GFX11-NEXT: v_writelane_b32 v40, s28, 24
-; GFX11-NEXT: s_mov_b32 s28, s44
; GFX11-NEXT: v_writelane_b32 v40, s29, 25
-; GFX11-NEXT: s_mov_b32 s29, s45
; GFX11-NEXT: v_writelane_b32 v40, s30, 26
; GFX11-NEXT: v_writelane_b32 v40, s31, 27
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x0
+; GFX11-NEXT: s_load_b512 s[16:31], s[0:1], 0x40
+; GFX11-NEXT: s_load_b512 s[0:15], s[0:1], 0x0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v2, s34 :: v_dual_mov_b32 v1, s31
+; GFX11-NEXT: v_mov_b32_e32 v0, s30
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
+; GFX11-NEXT: scratch_store_b32 off, v2, s36
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: v_readlane_b32 s31, v40, 27
; GFX11-NEXT: v_readlane_b32 s30, v40, 26
; GFX11-NEXT: v_readlane_b32 s29, v40, 25
@@ -15511,13 +14845,17 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28
-; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[34:35], s[0:1], 0x0
+; GFX10-SCRATCH-NEXT: s_load_dword s36, s[0:1], 0x0
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
-; GFX10-SCRATCH-NEXT: s_add_i32 s3, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s36
+; GFX10-SCRATCH-NEXT: s_add_i32 s36, s32, 8
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s8, 4
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s9, 5
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s10, 6
@@ -15530,50 +14868,29 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s17, 13
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s18, 14
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s19, 15
-; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_clause 0x2
-; GFX10-SCRATCH-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: ; meta instruction
-; GFX10-SCRATCH-NEXT: ; meta instruction
-; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x40
-; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v32i32_i32_inreg@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v32i32_i32_inreg@abs32@lo
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s20, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s21, 17
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18
-; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2
-; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20
-; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49
-; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36
-; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37
-; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21
-; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39
-; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40
-; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3
-; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22
-; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23
-; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24
-; GFX10-SCRATCH-NEXT: s_mov_b32 s28, s44
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s29, 25
-; GFX10-SCRATCH-NEXT: s_mov_b32 s29, s45
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 26
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 27
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_clause 0x1
+; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
+; GFX10-SCRATCH-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v32i32_i32_inreg@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v32i32_i32_inreg@abs32@lo
+; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s30
+; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s31
+; GFX10-SCRATCH-NEXT: scratch_store_dword off, v2, s36
+; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[0:1], s32
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 27
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 26
; GFX10-SCRATCH-NEXT: v_readlane_b32 s29, v40, 25
@@ -17397,6 +16714,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17423,6 +16741,7 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
@@ -17442,18 +16761,18 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s1, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s2, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_writelane_b32 v40, s1, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -17469,19 +16788,19 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-SCRATCH-LABEL: test_call_external_void_func_bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -17511,6 +16830,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17537,6 +16857,7 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v1bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v1bf16@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
@@ -17556,18 +16877,18 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX11-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s1, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s2, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_writelane_b32 v40, s1, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -17583,19 +16904,19 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v1bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v1bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v1bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v1bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v1bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -17625,6 +16946,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17651,6 +16973,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2bf16@abs32@lo
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
@@ -17670,18 +16993,18 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s1, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s2, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_writelane_b32 v40, s1, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -17697,19 +17020,19 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v2bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s1, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s2
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s1, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v2bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v2bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -17739,6 +17062,8 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17765,8 +17090,10 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17784,18 +17111,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX11-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s3, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_writelane_b32 v40, s2, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -17811,19 +17138,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v3bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v3bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v3bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -17853,6 +17180,8 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17879,8 +17208,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4bf16@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -17898,18 +17229,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX11-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s2, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s3, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s3
+; GFX11-NEXT: v_writelane_b32 v40, s2, 2
+; GFX11-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -17925,19 +17256,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v4bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s3, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s3
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s2, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s3, external_void_func_v4bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s2, external_void_func_v4bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[2:3]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -17967,6 +17298,10 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s3, s7
+; GFX9-NEXT: s_mov_b32 s2, s6
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
@@ -17993,8 +17328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
-; GFX10-NEXT: s_addk_i32 s32, 0x200
+; GFX10-NEXT: s_mov_b32 s3, s7
+; GFX10-NEXT: s_mov_b32 s2, s6
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
@@ -18012,18 +17351,18 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX11-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s35, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s35
+; GFX11-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -18039,19 +17378,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v8bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v8bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v8bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
@@ -18077,16 +17416,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: v_writelane_b32 v40, s34, 2
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: v_writelane_b32 v40, s34, 6
+; GFX9-NEXT: v_writelane_b32 v40, s4, 0
+; GFX9-NEXT: v_writelane_b32 v40, s5, 1
+; GFX9-NEXT: v_writelane_b32 v40, s6, 2
+; GFX9-NEXT: v_writelane_b32 v40, s7, 3
+; GFX9-NEXT: v_writelane_b32 v40, s30, 4
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
+; GFX9-NEXT: s_mov_b32 s3, s7
+; GFX9-NEXT: s_mov_b32 s2, s6
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, s8
+; GFX9-NEXT: s_mov_b32 s5, s9
+; GFX9-NEXT: s_mov_b32 s6, s10
+; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: v_writelane_b32 v40, s31, 5
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: v_readlane_b32 s34, v40, 2
+; GFX9-NEXT: v_readlane_b32 s31, v40, 5
+; GFX9-NEXT: v_readlane_b32 s30, v40, 4
+; GFX9-NEXT: v_readlane_b32 s7, v40, 3
+; GFX9-NEXT: v_readlane_b32 s6, v40, 2
+; GFX9-NEXT: v_readlane_b32 s5, v40, 1
+; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: v_readlane_b32 s34, v40, 6
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
@@ -18104,16 +17459,32 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-NEXT: v_writelane_b32 v40, s34, 6
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
+; GFX10-NEXT: s_mov_b32 s3, s7
+; GFX10-NEXT: s_mov_b32 s2, s6
+; GFX10-NEXT: v_writelane_b32 v40, s4, 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s4, s8
; GFX10-NEXT: s_addk_i32 s32, 0x200
-; GFX10-NEXT: v_writelane_b32 v40, s30, 0
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1
+; GFX10-NEXT: v_writelane_b32 v40, s5, 1
+; GFX10-NEXT: s_mov_b32 s5, s9
+; GFX10-NEXT: v_writelane_b32 v40, s6, 2
+; GFX10-NEXT: s_mov_b32 s6, s10
+; GFX10-NEXT: v_writelane_b32 v40, s7, 3
+; GFX10-NEXT: s_mov_b32 s7, s11
+; GFX10-NEXT: v_writelane_b32 v40, s30, 4
+; GFX10-NEXT: v_writelane_b32 v40, s31, 5
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
-; GFX10-NEXT: v_readlane_b32 s31, v40, 1
-; GFX10-NEXT: v_readlane_b32 s30, v40, 0
-; GFX10-NEXT: v_readlane_b32 s34, v40, 2
+; GFX10-NEXT: v_readlane_b32 s31, v40, 5
+; GFX10-NEXT: v_readlane_b32 s30, v40, 4
+; GFX10-NEXT: v_readlane_b32 s7, v40, 3
+; GFX10-NEXT: v_readlane_b32 s6, v40, 2
+; GFX10-NEXT: v_readlane_b32 s5, v40, 1
+; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: v_readlane_b32 s34, v40, 6
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
@@ -18126,18 +17497,18 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX11-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s0, s33
+; GFX11-NEXT: s_mov_b32 s34, s33
; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: s_or_saveexec_b32 s35, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: v_writelane_b32 v40, s0, 2
-; GFX11-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
-; GFX11-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
+; GFX11-NEXT: s_mov_b32 exec_lo, s35
+; GFX11-NEXT: v_writelane_b32 v40, s34, 2
+; GFX11-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
+; GFX11-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
@@ -18153,19 +17524,19 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
; GFX10-SCRATCH-LABEL: test_call_external_void_func_v16bf16_inreg:
; GFX10-SCRATCH: ; %bb.0:
; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, s33
; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
+; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
-; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v16bf16@abs32@hi
-; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v16bf16@abs32@lo
+; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s35
+; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s34, 2
+; GFX10-SCRATCH-NEXT: s_mov_b32 s35, external_void_func_v16bf16@abs32@hi
+; GFX10-SCRATCH-NEXT: s_mov_b32 s34, external_void_func_v16bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index a5792bf29ddc..4c21f8729745 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
;.
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb..25c684004446 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -847,11 +847,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s5, s33
+; GCN-NEXT: s_mov_b32 s10, s33
; GCN-NEXT: s_mov_b32 s33, s32
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0x400
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s31, 1
@@ -885,19 +885,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_writelane_b32 v40, s61, 29
; GCN-NEXT: v_writelane_b32 v40, s62, 30
; GCN-NEXT: v_writelane_b32 v40, s63, 31
-; GCN-NEXT: s_mov_b64 s[6:7], exec
-; GCN-NEXT: s_movk_i32 s4, 0x7b
+; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
-; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT: v_readfirstlane_b32 s6, v0
+; GCN-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: s_movk_i32 s0, 0x7b
+; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GCN-NEXT: s_xor_b64 exec, exec, s[10:11]
+; GCN-NEXT: s_xor_b64 exec, exec, s[8:9]
; GCN-NEXT: s_cbranch_execnz .LBB6_1
; GCN-NEXT: ; %bb.2:
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_readlane_b32 s63, v40, 31
; GCN-NEXT: v_readlane_b32 s62, v40, 30
; GCN-NEXT: v_readlane_b32 s61, v40, 29
@@ -930,22 +930,22 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GCN-NEXT: v_readlane_b32 s34, v40, 2
; GCN-NEXT: v_readlane_b32 s31, v40, 1
; GCN-NEXT: v_readlane_b32 s30, v40, 0
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_addk_i32 s32, 0xfc00
-; GCN-NEXT: s_mov_b32 s33, s5
+; GCN-NEXT: s_mov_b32 s33, s10
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_indirect_call_vgpr_ptr_inreg_arg:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_mov_b32 s5, s33
+; GISEL-NEXT: s_mov_b32 s10, s33
; GISEL-NEXT: s_mov_b32 s33, s32
-; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GISEL-NEXT: s_mov_b64 exec, s[6:7]
+; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0x400
; GISEL-NEXT: v_writelane_b32 v40, s30, 0
; GISEL-NEXT: v_writelane_b32 v40, s31, 1
@@ -979,19 +979,19 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_writelane_b32 v40, s61, 29
; GISEL-NEXT: v_writelane_b32 v40, s62, 30
; GISEL-NEXT: v_writelane_b32 v40, s63, 31
-; GISEL-NEXT: s_mov_b64 s[6:7], exec
-; GISEL-NEXT: s_movk_i32 s4, 0x7b
+; GISEL-NEXT: s_mov_b64 s[4:5], exec
; GISEL-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
-; GISEL-NEXT: v_readfirstlane_b32 s8, v0
-; GISEL-NEXT: v_readfirstlane_b32 s9, v1
-; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GISEL-NEXT: s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: v_readfirstlane_b32 s6, v0
+; GISEL-NEXT: v_readfirstlane_b32 s7, v1
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1]
+; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GISEL-NEXT: s_movk_i32 s0, 0x7b
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GISEL-NEXT: ; implicit-def: $vgpr0
-; GISEL-NEXT: s_xor_b64 exec, exec, s[10:11]
+; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9]
; GISEL-NEXT: s_cbranch_execnz .LBB6_1
; GISEL-NEXT: ; %bb.2:
-; GISEL-NEXT: s_mov_b64 exec, s[6:7]
+; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: v_readlane_b32 s63, v40, 31
; GISEL-NEXT: v_readlane_b32 s62, v40, 30
; GISEL-NEXT: v_readlane_b32 s61, v40, 29
@@ -1024,11 +1024,11 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
; GISEL-NEXT: v_readlane_b32 s34, v40, 2
; GISEL-NEXT: v_readlane_b32 s31, v40, 1
; GISEL-NEXT: v_readlane_b32 s30, v40, 0
-; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1
; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GISEL-NEXT: s_mov_b64 exec, s[6:7]
+; GISEL-NEXT: s_mov_b64 exec, s[4:5]
; GISEL-NEXT: s_addk_i32 s32, 0xfc00
-; GISEL-NEXT: s_mov_b32 s33, s5
+; GISEL-NEXT: s_mov_b32 s33, s10
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: s_setpc_b64 s[30:31]
call amdgpu_gfx void %fptr(i32 inreg 123)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll
new file mode 100644
index 000000000000..d101d8da5e0f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+
+; This looks like a partially lowered module, but the non-lowered GV isn't used by any kernels.
+; In such cases, LowerModuleLDS is free to leave it in and ignore it, and we want to make sure
+; LowerModuleLDS doesn't crash if it re-runs on such modules.
+@notLowered = addrspace(3) global i32 poison
+@lowered = addrspace(3) global i32 poison, !absolute_symbol !0
+
+@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @notLowered to ptr)], section "llvm.metadata"
+
+define amdgpu_kernel void @kern(i32 %val0) {
+; CHECK-LABEL: define amdgpu_kernel void @kern(
+; CHECK-SAME: i32 [[VAL0:%.*]]) {
+; CHECK-NEXT: [[VAL1:%.*]] = add i32 [[VAL0]], 4
+; CHECK-NEXT: store i32 [[VAL1]], ptr addrspace(3) @lowered, align 4
+; CHECK-NEXT: ret void
+;
+ %val1 = add i32 %val0, 4
+ store i32 %val1, ptr addrspace(3) @lowered
+ ret void
+}
+
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
index b512a43aa102..b1f4f2ef1ef5 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
@@ -8,7 +8,7 @@
define amdgpu_kernel void @kern() {
%val0 = load i32, ptr addrspace(3) @var1
%val1 = add i32 %val0, 4
- store i32 %val1, ptr addrspace(3) @var1
+ store i32 %val1, ptr addrspace(3) @var2
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
new file mode 100644
index 000000000000..f1d946376afe
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -0,0 +1,47 @@
+
+; Default O0
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O0
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O1
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O1
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O2
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O2
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O3
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O3
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; First print will be from the New PM during the full LTO pipeline.
+; Second print will be from the legacy PM during the CG pipeline.
+
+; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module]
+; CHECK: ModulePass Manager
+; CHECK: Lower uses of LDS variables from non-kernel functions
+
+@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+ store i32 1, ptr addrspace(3) @lds
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
index e7488e059ee9..20edbd6c0d0f 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
@@ -157,27 +157,27 @@ define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 {
define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR2]] {
+; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-1-NEXT: ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-3-NEXT: ret void
;
; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-20-NEXT: ret void
;
@@ -235,23 +235,23 @@ define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0,
define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-1-NEXT: ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-3-NEXT: ret void
;
; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-20-NEXT: ret void
;
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d92ba7774bd3..d070dc3b770f 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 2df219bd0401..f62f1d57aec8 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index eaef63bbfc3c..c1d647c5d3b9 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 297a056526ca..384a9c4043a1 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() {
!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
index 0139c52db1d5..27e9d0fce942 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
+++ b/llvm/test/CodeGen/AMDGPU/schedule-addrspaces.ll
@@ -10,7 +10,7 @@ define amdgpu_gfx void @example(<4 x i32> inreg %rsrc, ptr addrspace(5) %src, i3
; CHECK-NEXT: scratch_load_b32 v2, v0, off
; CHECK-NEXT: scratch_load_b32 v3, v3, off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[4:7], 0 offen
+; CHECK-NEXT: buffer_store_b64 v[2:3], v1, s[0:3], 0 offen
; CHECK-NEXT: s_setpc_b64 s[30:31]
%x0 = load i32, ptr addrspace(5) %src
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
index cdaac14833e0..ec1de0222d5d 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll
@@ -6,7 +6,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %
; GCN-LABEL: sink_scratch_pointer:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc0 .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb2
; GCN-NEXT: scratch_load_b32 v0, v0, off offset:-4
@@ -21,7 +21,7 @@ define amdgpu_gfx i32 @sink_scratch_pointer(ptr addrspace(5) %stack, i32 inreg %
; GISEL-LABEL: sink_scratch_pointer:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: s_cmp_lg_u32 s4, 0
+; GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-NEXT: s_cbranch_scc0 .LBB0_2
; GISEL-NEXT: ; %bb.1: ; %bb2
; GISEL-NEXT: scratch_load_b32 v0, v0, off offset:-4
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index f229f33664e1..539cfc71a80f 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 8d5dc7943164..049db01badac 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index 7a6f82d589e6..c9387f196dff 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index c04154c7c23f..7183da2c5efc 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index 2d5ff045d12e..6ed04cf63d20 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 {
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index e8bf6fc8321b..d5ba2fd617c6 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
attributes #1 = { "uniform-work-group-size"="false" }
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 473eea4eedce..7f0dfeaf75c8 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
attributes #0 = { nounwind readnone }
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 221f1a11676f..8616c73ad51c 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 {
attributes #0 = { "uniform-work-group-size"="false" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 717d3d975aaf..040799435db4 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -540,6 +540,7 @@ define internal void @use512vgprs() {
}
define void @foo() #0 {
+ call void asm sideeffect "; use $0", "a"(i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll b/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 365727c9dd27..0795525fba1b 100644
--- a/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/llvm/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -8,10 +8,8 @@
%struct.Foo = type { ptr }
-; ARM-LABEL: foo:
-; THUMB-LABEL: foo:
-; T2-LABEL: foo:
define ptr @foo(ptr %this, i32 %acc) nounwind readonly align 2 {
+; ARM-LABEL: foo:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: add r2, r0, #4
; ARM-NEXT: mov r12, #1
@@ -44,6 +42,7 @@ define ptr @foo(ptr %this, i32 %acc) nounwind readonly align 2 {
; ARM-NEXT: add r0, r0, r1, lsl #2
; ARM-NEXT: mov pc, lr
;
+; THUMB-LABEL: foo:
; THUMB: @ %bb.0: @ %entry
; THUMB-NEXT: .save {r4, r5, r7, lr}
; THUMB-NEXT: push {r4, r5, r7, lr}
@@ -91,6 +90,7 @@ define ptr @foo(ptr %this, i32 %acc) nounwind readonly align 2 {
; THUMB-NEXT: pop {r0}
; THUMB-NEXT: bx r0
;
+; T2-LABEL: foo:
; T2: @ %bb.0: @ %entry
; T2-NEXT: adds r2, r0, #4
; T2-NEXT: mov.w r12, #1
@@ -125,6 +125,7 @@ define ptr @foo(ptr %this, i32 %acc) nounwind readonly align 2 {
; T2-NEXT: add.w r0, r0, r1, lsl #2
; T2-NEXT: bx lr
;
+; V8-LABEL: foo:
; V8: @ %bb.0: @ %entry
; V8-NEXT: adds r2, r0, #4
; V8-NEXT: mov.w r12, #1
@@ -210,11 +211,8 @@ sw.epilog: ; preds = %tailrecurse.switch
%struct.S = type { ptr, [1 x i8] }
-; ARM-LABEL: bar:
-; THUMB-LABEL: bar:
-; T2-LABEL: bar:
-; V8-LABEL: bar:
define internal zeroext i8 @bar(ptr %x, ptr nocapture %y) nounwind readonly {
+; ARM-LABEL: bar:
; ARM: @ %bb.0: @ %entry
; ARM-NEXT: ldrb r2, [r0, #4]
; ARM-NEXT: ands r2, r2, #112
@@ -230,6 +228,7 @@ define internal zeroext i8 @bar(ptr %x, ptr nocapture %y) nounwind readonly {
; ARM-NEXT: mov r0, #1
; ARM-NEXT: mov pc, lr
;
+; THUMB-LABEL: bar:
; THUMB: @ %bb.0: @ %entry
; THUMB-NEXT: ldrb r2, [r0, #4]
; THUMB-NEXT: movs r3, #112
@@ -253,6 +252,7 @@ define internal zeroext i8 @bar(ptr %x, ptr nocapture %y) nounwind readonly {
; THUMB-NEXT: ands r0, r1
; THUMB-NEXT: bx lr
;
+; T2-LABEL: bar:
; T2: @ %bb.0: @ %entry
; T2-NEXT: ldrb r2, [r0, #4]
; T2-NEXT: ands r2, r2, #112
@@ -270,6 +270,7 @@ define internal zeroext i8 @bar(ptr %x, ptr nocapture %y) nounwind readonly {
; T2-NEXT: movs r0, #1
; T2-NEXT: bx lr
;
+; V8-LABEL: bar:
; V8: @ %bb.0: @ %entry
; V8-NEXT: ldrb r2, [r0, #4]
; V8-NEXT: ands r2, r2, #112
diff --git a/llvm/test/CodeGen/ARM/select.ll b/llvm/test/CodeGen/ARM/select.ll
index 4bb79651f040..24ca9aeac7f2 100644
--- a/llvm/test/CodeGen/ARM/select.ll
+++ b/llvm/test/CodeGen/ARM/select.ll
@@ -1,14 +1,25 @@
-; RUN: llc -mtriple=arm-apple-darwin %s -o - | FileCheck %s
-
-; RUN: llc -mtriple=arm-eabi -mattr=+vfp2 %s -o - \
-; RUN: | FileCheck %s --check-prefix=CHECK-VFP
-
-; RUN: llc -mtriple=thumbv7-apple-darwin -mattr=+neon,+thumb2 %s -o - \
-; RUN: | FileCheck %s --check-prefix=CHECK-NEON
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=armv7-eabi -mattr=-fpregs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-ARM
+; RUN: llc -mtriple=armv7-eabi -mattr=+vfp2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-VFP
+; RUN: llc -mtriple=thumbv7-apple-darwin -mattr=+neon,+thumb2 %s -o - | FileCheck %s --check-prefix=CHECK-NEON
define i32 @f1(i32 %a.s) {
-;CHECK-LABEL: f1:
-;CHECK: moveq
+; CHECK-LABEL: f1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r1, #3
+; CHECK-NEXT: cmp r0, #4
+; CHECK-NEXT: movweq r1, #2
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f1:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r1, #3
+; CHECK-NEON-NEXT: cmp r0, #4
+; CHECK-NEON-NEXT: it eq
+; CHECK-NEON-NEXT: moveq r1, #2
+; CHECK-NEON-NEXT: mov r0, r1
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp eq i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -16,8 +27,22 @@ entry:
}
define i32 @f2(i32 %a.s) {
-;CHECK-LABEL: f2:
-;CHECK: movgt
+; CHECK-LABEL: f2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r1, #3
+; CHECK-NEXT: cmp r0, #4
+; CHECK-NEXT: movwgt r1, #2
+; CHECK-NEXT: mov r0, r1
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f2:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r1, #3
+; CHECK-NEON-NEXT: cmp r0, #4
+; CHECK-NEON-NEXT: it gt
+; CHECK-NEON-NEXT: movgt r1, #2
+; CHECK-NEON-NEXT: mov r0, r1
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp sgt i32 %a.s, 4
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -25,8 +50,22 @@ entry:
}
define i32 @f3(i32 %a.s, i32 %b.s) {
-;CHECK-LABEL: f3:
-;CHECK: movlt
+; CHECK-LABEL: f3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movwlt r2, #2
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f3:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r2, #3
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it lt
+; CHECK-NEON-NEXT: movlt r2, #2
+; CHECK-NEON-NEXT: mov r0, r2
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp slt i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -34,8 +73,22 @@ entry:
}
define i32 @f4(i32 %a.s, i32 %b.s) {
-;CHECK-LABEL: f4:
-;CHECK: movle
+; CHECK-LABEL: f4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movwle r2, #2
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f4:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r2, #3
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it le
+; CHECK-NEON-NEXT: movle r2, #2
+; CHECK-NEON-NEXT: mov r0, r2
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp sle i32 %a.s, %b.s
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -43,8 +96,22 @@ entry:
}
define i32 @f5(i32 %a.u, i32 %b.u) {
-;CHECK-LABEL: f5:
-;CHECK: movls
+; CHECK-LABEL: f5:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movwls r2, #2
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f5:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r2, #3
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it ls
+; CHECK-NEON-NEXT: movls r2, #2
+; CHECK-NEON-NEXT: mov r0, r2
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp ule i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -52,8 +119,22 @@ entry:
}
define i32 @f6(i32 %a.u, i32 %b.u) {
-;CHECK-LABEL: f6:
-;CHECK: movhi
+; CHECK-LABEL: f6:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r2, #3
+; CHECK-NEXT: cmp r0, r1
+; CHECK-NEXT: movwhi r2, #2
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: f6:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: movs r2, #3
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it hi
+; CHECK-NEON-NEXT: movhi r2, #2
+; CHECK-NEON-NEXT: mov r0, r2
+; CHECK-NEON-NEXT: bx lr
entry:
%tmp = icmp ugt i32 %a.u, %b.u
%tmp1.s = select i1 %tmp, i32 2, i32 3
@@ -61,11 +142,61 @@ entry:
}
define double @f7(double %a, double %b) {
-;CHECK-LABEL: f7:
-;CHECK: movmi
-;CHECK: movpl
-;CHECK-VFP-LABEL: f7:
-;CHECK-VFP: vmovmi
+; CHECK-ARM-LABEL: f7:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: .save {r4, r5, r11, lr}
+; CHECK-ARM-NEXT: push {r4, r5, r11, lr}
+; CHECK-ARM-NEXT: mov r4, r3
+; CHECK-ARM-NEXT: movw r3, #48758
+; CHECK-ARM-NEXT: mov r5, r2
+; CHECK-ARM-NEXT: movw r2, #14680
+; CHECK-ARM-NEXT: movt r2, #51380
+; CHECK-ARM-NEXT: movt r3, #16371
+; CHECK-ARM-NEXT: bl __aeabi_dcmplt
+; CHECK-ARM-NEXT: cmp r0, #0
+; CHECK-ARM-NEXT: movwne r4, #0
+; CHECK-ARM-NEXT: movwne r5, #0
+; CHECK-ARM-NEXT: movtne r4, #49136
+; CHECK-ARM-NEXT: mov r0, r5
+; CHECK-ARM-NEXT: mov r1, r4
+; CHECK-ARM-NEXT: pop {r4, r5, r11, pc}
+;
+; CHECK-VFP-LABEL: f7:
+; CHECK-VFP: @ %bb.0:
+; CHECK-VFP-NEXT: vldr d17, .LCPI6_0
+; CHECK-VFP-NEXT: vmov d19, r0, r1
+; CHECK-VFP-NEXT: vmov.f64 d16, #-1.000000e+00
+; CHECK-VFP-NEXT: vcmp.f64 d19, d17
+; CHECK-VFP-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-VFP-NEXT: vmov d18, r2, r3
+; CHECK-VFP-NEXT: vmovmi.f64 d18, d16
+; CHECK-VFP-NEXT: vmov r0, r1, d18
+; CHECK-VFP-NEXT: bx lr
+; CHECK-VFP-NEXT: .p2align 3
+; CHECK-VFP-NEXT: @ %bb.1:
+; CHECK-VFP-NEXT: .LCPI6_0:
+; CHECK-VFP-NEXT: .long 3367254360 @ double 1.234
+; CHECK-VFP-NEXT: .long 1072938614
+;
+; CHECK-NEON-LABEL: f7:
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: vldr d17, LCPI6_0
+; CHECK-NEON-NEXT: vmov d19, r0, r1
+; CHECK-NEON-NEXT: vmov d18, r2, r3
+; CHECK-NEON-NEXT: vcmp.f64 d19, d17
+; CHECK-NEON-NEXT: vmov.f64 d16, #-1.000000e+00
+; CHECK-NEON-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-NEON-NEXT: it mi
+; CHECK-NEON-NEXT: vmovmi.f64 d18, d16
+; CHECK-NEON-NEXT: vmov r0, r1, d18
+; CHECK-NEON-NEXT: bx lr
+; CHECK-NEON-NEXT: .p2align 3
+; CHECK-NEON-NEXT: @ %bb.1:
+; CHECK-NEON-NEXT: .data_region
+; CHECK-NEON-NEXT: LCPI6_0:
+; CHECK-NEON-NEXT: .long 3367254360 @ double 1.234
+; CHECK-NEON-NEXT: .long 1072938614
+; CHECK-NEON-NEXT: .end_data_region
%tmp = fcmp olt double %a, 1.234e+00
%tmp1 = select i1 %tmp, double -1.000e+00, double %b
ret double %tmp1
@@ -77,18 +208,49 @@ define double @f7(double %a, double %b) {
; a lack of a custom lowering routine for an ISD::SELECT. This would result in
; two "it" blocks in the code: one for the "icmp" and another to move the index
; into the constant pool based on the value of the "icmp". If we have one "it"
-; block generated, odds are good that we have close to the ideal code for this:
+; block generated, odds are good that we have close to the ideal code for this.
+define arm_apcscc float @f8(i32 %a) nounwind {
+; CHECK-ARM-LABEL: f8:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: movw r1, #29905
+; CHECK-ARM-NEXT: movw r2, #1123
+; CHECK-ARM-NEXT: movt r1, #16408
+; CHECK-ARM-NEXT: cmp r0, r2
+; CHECK-ARM-NEXT: movweq r1, #62390
+; CHECK-ARM-NEXT: movteq r1, #16285
+; CHECK-ARM-NEXT: mov r0, r1
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-VFP-LABEL: f8:
+; CHECK-VFP: @ %bb.0:
+; CHECK-VFP-NEXT: movw r2, #1123
+; CHECK-VFP-NEXT: adr r1, .LCPI7_0
+; CHECK-VFP-NEXT: cmp r0, r2
+; CHECK-VFP-NEXT: addeq r1, r1, #4
+; CHECK-VFP-NEXT: ldr r0, [r1]
+; CHECK-VFP-NEXT: bx lr
+; CHECK-VFP-NEXT: .p2align 2
+; CHECK-VFP-NEXT: @ %bb.1:
+; CHECK-VFP-NEXT: .LCPI7_0:
+; CHECK-VFP-NEXT: .long 0x401874d1 @ float 2.38212991
+; CHECK-VFP-NEXT: .long 0x3f9df3b6 @ float 1.23399997
;
; CHECK-NEON-LABEL: f8:
-; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
-; CHECK-NEON: movw [[R3:r[0-9]+]], #1123
-; CHECK-NEON-NEXT: cmp r0, [[R3]]
-; CHECK-NEON-NEXT: it eq
-; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
-; CHECK-NEON-NEXT: ldr
-; CHECK-NEON: bx
-
-define arm_apcscc float @f8(i32 %a) nounwind {
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: adr r1, LCPI7_0
+; CHECK-NEON-NEXT: movw r2, #1123
+; CHECK-NEON-NEXT: cmp r0, r2
+; CHECK-NEON-NEXT: it eq
+; CHECK-NEON-NEXT: addeq r1, #4
+; CHECK-NEON-NEXT: ldr r0, [r1]
+; CHECK-NEON-NEXT: bx lr
+; CHECK-NEON-NEXT: .p2align 2
+; CHECK-NEON-NEXT: @ %bb.1:
+; CHECK-NEON-NEXT: .data_region
+; CHECK-NEON-NEXT: LCPI7_0:
+; CHECK-NEON-NEXT: .long 0x401874d1 @ float 2.38212991
+; CHECK-NEON-NEXT: .long 0x3f9df3b6 @ float 1.23399997
+; CHECK-NEON-NEXT: .end_data_region
%tmp = icmp eq i32 %a, 1123
%tmp1 = select i1 %tmp, float 0x3FF3BE76C0000000, float 0x40030E9A20000000
ret float %tmp1
@@ -98,10 +260,40 @@ define arm_apcscc float @f8(i32 %a) nounwind {
; Glue values can only have a single use, but the following test exposed a
; case where a SELECT was lowered with 2 uses of a comparison, causing the
; scheduler to assert.
-; CHECK-VFP-LABEL: f9:
-
declare ptr @objc_msgSend(ptr, ptr, ...)
define void @f9() optsize {
+; CHECK-LABEL: f9:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: movw r2, #0
+; CHECK-NEXT: movw r3, #0
+; CHECK-NEXT: mov r1, #1065353216
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: movt r2, #16672
+; CHECK-NEXT: movt r3, #32704
+; CHECK-NEXT: strd r0, r1, [sp]
+; CHECK-NEXT: bl objc_msgSend
+; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: pop {r11, pc}
+;
+; CHECK-NEON-LABEL: f9:
+; CHECK-NEON: @ %bb.0: @ %entry
+; CHECK-NEON-NEXT: str lr, [sp, #-4]!
+; CHECK-NEON-NEXT: sub sp, #8
+; CHECK-NEON-NEXT: movs r2, #0
+; CHECK-NEON-NEXT: movs r3, #0
+; CHECK-NEON-NEXT: mov.w r0, #1065353216
+; CHECK-NEON-NEXT: movs r1, #0
+; CHECK-NEON-NEXT: movt r2, #16672
+; CHECK-NEON-NEXT: movt r3, #32704
+; CHECK-NEON-NEXT: strd r1, r0, [sp]
+; CHECK-NEON-NEXT: bl _objc_msgSend
+; CHECK-NEON-NEXT: add sp, #8
+; CHECK-NEON-NEXT: ldr lr, [sp], #4
+; CHECK-NEON-NEXT: bx lr
entry:
%cmp = icmp eq ptr undef, inttoptr (i32 4 to ptr)
%conv191 = select i1 %cmp, float -3.000000e+00, float 0.000000e+00
@@ -117,36 +309,151 @@ entry:
ret void
}
-; CHECK-LABEL: f10:
define float @f10(i32 %a, i32 %b) nounwind uwtable readnone ssp {
-; CHECK-NOT: floatsisf
+; CHECK-ARM-LABEL: f10:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: mov r2, #0
+; CHECK-ARM-NEXT: cmp r0, r1
+; CHECK-ARM-NEXT: moveq r2, #1065353216
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-VFP-LABEL: f10:
+; CHECK-VFP: @ %bb.0:
+; CHECK-VFP-NEXT: vmov.f32 s2, #1.000000e+00
+; CHECK-VFP-NEXT: vldr s0, .LCPI9_0
+; CHECK-VFP-NEXT: cmp r0, r1
+; CHECK-VFP-NEXT: vmoveq.f32 s0, s2
+; CHECK-VFP-NEXT: vmov r0, s0
+; CHECK-VFP-NEXT: bx lr
+; CHECK-VFP-NEXT: .p2align 2
+; CHECK-VFP-NEXT: @ %bb.1:
+; CHECK-VFP-NEXT: .LCPI9_0:
+; CHECK-VFP-NEXT: .long 0x00000000 @ float 0
+;
+; CHECK-NEON-LABEL: f10:
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: vldr s0, LCPI9_0
+; CHECK-NEON-NEXT: vmov.f32 s2, #1.000000e+00
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it eq
+; CHECK-NEON-NEXT: vmoveq.f32 s0, s2
+; CHECK-NEON-NEXT: vmov r0, s0
+; CHECK-NEON-NEXT: bx lr
+; CHECK-NEON-NEXT: .p2align 2
+; CHECK-NEON-NEXT: @ %bb.1:
+; CHECK-NEON-NEXT: .data_region
+; CHECK-NEON-NEXT: LCPI9_0:
+; CHECK-NEON-NEXT: .long 0x00000000 @ float 0
+; CHECK-NEON-NEXT: .end_data_region
%1 = icmp eq i32 %a, %b
%2 = zext i1 %1 to i32
%3 = sitofp i32 %2 to float
ret float %3
}
-; CHECK-LABEL: f11:
define float @f11(i32 %a, i32 %b) nounwind uwtable readnone ssp {
-; CHECK-NOT: floatsisf
+; CHECK-ARM-LABEL: f11:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: mov r2, #0
+; CHECK-ARM-NEXT: cmp r0, r1
+; CHECK-ARM-NEXT: movweq r2, #0
+; CHECK-ARM-NEXT: movteq r2, #49024
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-VFP-LABEL: f11:
+; CHECK-VFP: @ %bb.0:
+; CHECK-VFP-NEXT: vmov.f32 s2, #-1.000000e+00
+; CHECK-VFP-NEXT: vldr s0, .LCPI10_0
+; CHECK-VFP-NEXT: cmp r0, r1
+; CHECK-VFP-NEXT: vmoveq.f32 s0, s2
+; CHECK-VFP-NEXT: vmov r0, s0
+; CHECK-VFP-NEXT: bx lr
+; CHECK-VFP-NEXT: .p2align 2
+; CHECK-VFP-NEXT: @ %bb.1:
+; CHECK-VFP-NEXT: .LCPI10_0:
+; CHECK-VFP-NEXT: .long 0x00000000 @ float 0
+;
+; CHECK-NEON-LABEL: f11:
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: vldr s0, LCPI10_0
+; CHECK-NEON-NEXT: vmov.f32 s2, #-1.000000e+00
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it eq
+; CHECK-NEON-NEXT: vmoveq.f32 s0, s2
+; CHECK-NEON-NEXT: vmov r0, s0
+; CHECK-NEON-NEXT: bx lr
+; CHECK-NEON-NEXT: .p2align 2
+; CHECK-NEON-NEXT: @ %bb.1:
+; CHECK-NEON-NEXT: .data_region
+; CHECK-NEON-NEXT: LCPI10_0:
+; CHECK-NEON-NEXT: .long 0x00000000 @ float 0
+; CHECK-NEON-NEXT: .end_data_region
%1 = icmp eq i32 %a, %b
%2 = sitofp i1 %1 to float
ret float %2
}
-; CHECK-LABEL: f12:
define float @f12(i32 %a, i32 %b) nounwind uwtable readnone ssp {
-; CHECK-NOT: floatunsisf
+; CHECK-ARM-LABEL: f12:
+; CHECK-ARM: @ %bb.0:
+; CHECK-ARM-NEXT: mov r2, #0
+; CHECK-ARM-NEXT: cmp r0, r1
+; CHECK-ARM-NEXT: moveq r2, #1065353216
+; CHECK-ARM-NEXT: mov r0, r2
+; CHECK-ARM-NEXT: bx lr
+;
+; CHECK-VFP-LABEL: f12:
+; CHECK-VFP: @ %bb.0:
+; CHECK-VFP-NEXT: vmov.f32 s2, #1.000000e+00
+; CHECK-VFP-NEXT: vldr s0, .LCPI11_0
+; CHECK-VFP-NEXT: cmp r0, r1
+; CHECK-VFP-NEXT: vmoveq.f32 s0, s2
+; CHECK-VFP-NEXT: vmov r0, s0
+; CHECK-VFP-NEXT: bx lr
+; CHECK-VFP-NEXT: .p2align 2
+; CHECK-VFP-NEXT: @ %bb.1:
+; CHECK-VFP-NEXT: .LCPI11_0:
+; CHECK-VFP-NEXT: .long 0x00000000 @ float 0
+;
+; CHECK-NEON-LABEL: f12:
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: vldr s0, LCPI11_0
+; CHECK-NEON-NEXT: vmov.f32 s2, #1.000000e+00
+; CHECK-NEON-NEXT: cmp r0, r1
+; CHECK-NEON-NEXT: it eq
+; CHECK-NEON-NEXT: vmoveq.f32 s0, s2
+; CHECK-NEON-NEXT: vmov r0, s0
+; CHECK-NEON-NEXT: bx lr
+; CHECK-NEON-NEXT: .p2align 2
+; CHECK-NEON-NEXT: @ %bb.1:
+; CHECK-NEON-NEXT: .data_region
+; CHECK-NEON-NEXT: LCPI11_0:
+; CHECK-NEON-NEXT: .long 0x00000000 @ float 0
+; CHECK-NEON-NEXT: .end_data_region
%1 = icmp eq i32 %a, %b
%2 = uitofp i1 %1 to float
ret float %2
}
-; CHECK-LABEL: test_overflow_recombine:
define i1 @test_overflow_recombine(i32 %in1, i32 %in2) {
-; CHECK: smull [[LO:r[0-9]+]], [[HI:r[0-9]+]]
-; CHECK: subs [[ZERO:r[0-9]+]], [[HI]], [[LO]], asr #31
-; CHECK: movne [[ZERO]], #1
+; CHECK-LABEL: test_overflow_recombine:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mul r2, r0, r1
+; CHECK-NEXT: smmul r0, r0, r1
+; CHECK-NEXT: subs r0, r0, r2, asr #31
+; CHECK-NEXT: movwne r0, #1
+; CHECK-NEXT: bx lr
+;
+; CHECK-NEON-LABEL: test_overflow_recombine:
+; CHECK-NEON: @ %bb.0:
+; CHECK-NEON-NEXT: mul r2, r0, r1
+; CHECK-NEON-NEXT: smmul r0, r0, r1
+; CHECK-NEON-NEXT: subs.w r0, r0, r2, asr #31
+; CHECK-NEON-NEXT: it ne
+; CHECK-NEON-NEXT: movne r0, #1
+; CHECK-NEON-NEXT: bx lr
%prod = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 %in1, i32 %in2)
%overflow = extractvalue { i32, i1 } %prod, 1
ret i1 %overflow
diff --git a/llvm/test/CodeGen/RISCV/double-convert.ll b/llvm/test/CodeGen/RISCV/double-convert.ll
index 7a9439e5b322..c1429642962e 100644
--- a/llvm/test/CodeGen/RISCV/double-convert.ll
+++ b/llvm/test/CodeGen/RISCV/double-convert.ll
@@ -1116,13 +1116,7 @@ define i64 @fmv_x_d(double %a, double %b) nounwind {
;
; RV32IZFINXZDINX-LABEL: fmv_x_d:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -16
; RV32IZFINXZDINX-NEXT: fadd.d a0, a0, a2
-; RV32IZFINXZDINX-NEXT: sw a0, 8(sp)
-; RV32IZFINXZDINX-NEXT: sw a1, 12(sp)
-; RV32IZFINXZDINX-NEXT: lw a0, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a1, 12(sp)
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 16
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: fmv_x_d:
@@ -1257,13 +1251,13 @@ define double @fmv_d_x(i64 %a, i64 %b) nounwind {
; RV32IFD-LABEL: fmv_d_x:
; RV32IFD: # %bb.0:
; RV32IFD-NEXT: addi sp, sp, -16
-; RV32IFD-NEXT: sw a3, 4(sp)
-; RV32IFD-NEXT: sw a2, 0(sp)
-; RV32IFD-NEXT: sw a1, 12(sp)
; RV32IFD-NEXT: sw a0, 8(sp)
-; RV32IFD-NEXT: fld fa5, 0(sp)
+; RV32IFD-NEXT: sw a1, 12(sp)
+; RV32IFD-NEXT: fld fa5, 8(sp)
+; RV32IFD-NEXT: sw a2, 8(sp)
+; RV32IFD-NEXT: sw a3, 12(sp)
; RV32IFD-NEXT: fld fa4, 8(sp)
-; RV32IFD-NEXT: fadd.d fa0, fa4, fa5
+; RV32IFD-NEXT: fadd.d fa0, fa5, fa4
; RV32IFD-NEXT: addi sp, sp, 16
; RV32IFD-NEXT: ret
;
@@ -1276,17 +1270,7 @@ define double @fmv_d_x(i64 %a, i64 %b) nounwind {
;
; RV32IZFINXZDINX-LABEL: fmv_d_x:
; RV32IZFINXZDINX: # %bb.0:
-; RV32IZFINXZDINX-NEXT: addi sp, sp, -16
-; RV32IZFINXZDINX-NEXT: sw a3, 4(sp)
-; RV32IZFINXZDINX-NEXT: sw a2, 0(sp)
-; RV32IZFINXZDINX-NEXT: sw a1, 12(sp)
-; RV32IZFINXZDINX-NEXT: sw a0, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a0, 0(sp)
-; RV32IZFINXZDINX-NEXT: lw a1, 4(sp)
-; RV32IZFINXZDINX-NEXT: lw a2, 8(sp)
-; RV32IZFINXZDINX-NEXT: lw a3, 12(sp)
-; RV32IZFINXZDINX-NEXT: fadd.d a0, a2, a0
-; RV32IZFINXZDINX-NEXT: addi sp, sp, 16
+; RV32IZFINXZDINX-NEXT: fadd.d a0, a0, a2
; RV32IZFINXZDINX-NEXT: ret
;
; RV64IZFINXZDINX-LABEL: fmv_d_x:
diff --git a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
index 71769a800c06..c480ba800c69 100644
--- a/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
+++ b/llvm/test/CodeGen/RISCV/inline-asm-d-constraint-f.ll
@@ -75,24 +75,10 @@ define double @constraint_f_double_abi_name(double %a) nounwind {
define double @constraint_gpr(double %x) {
; RV32F-LABEL: constraint_gpr:
; RV32F: # %bb.0:
-; RV32F-NEXT: addi sp, sp, -32
-; RV32F-NEXT: .cfi_def_cfa_offset 32
-; RV32F-NEXT: sw a0, 8(sp)
-; RV32F-NEXT: sw a1, 12(sp)
-; RV32F-NEXT: fld fa5, 8(sp)
-; RV32F-NEXT: fsd fa5, 24(sp)
-; RV32F-NEXT: lw a0, 24(sp)
-; RV32F-NEXT: lw a1, 28(sp)
+; RV32F-NEXT: .cfi_def_cfa_offset 0
; RV32F-NEXT: #APP
; RV32F-NEXT: mv a0, a0
; RV32F-NEXT: #NO_APP
-; RV32F-NEXT: sw a1, 20(sp)
-; RV32F-NEXT: sw a0, 16(sp)
-; RV32F-NEXT: fld fa5, 16(sp)
-; RV32F-NEXT: fsd fa5, 8(sp)
-; RV32F-NEXT: lw a0, 8(sp)
-; RV32F-NEXT: lw a1, 12(sp)
-; RV32F-NEXT: addi sp, sp, 32
; RV32F-NEXT: ret
;
; RV64F-LABEL: constraint_gpr:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
index b7afee754f68..5252eb71c383 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -416,8 +416,8 @@ define double @bitcast_v1i64_f64(<1 x i64> %a) {
; RV32ELEN32: # %bb.0:
; RV32ELEN32-NEXT: addi sp, sp, -16
; RV32ELEN32-NEXT: .cfi_def_cfa_offset 16
-; RV32ELEN32-NEXT: sw a1, 12(sp)
; RV32ELEN32-NEXT: sw a0, 8(sp)
+; RV32ELEN32-NEXT: sw a1, 12(sp)
; RV32ELEN32-NEXT: fld fa0, 8(sp)
; RV32ELEN32-NEXT: addi sp, sp, 16
; RV32ELEN32-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/spill-fill-fold.ll b/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
index a9a0cc5cf94d..8cf5f55ad5c5 100644
--- a/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
+++ b/llvm/test/CodeGen/RISCV/spill-fill-fold.ll
@@ -290,8 +290,8 @@ define double @spill_i64_to_double(i64 %a) nounwind {
; RV32ID-NEXT: fsd fs9, 40(sp) # 8-byte Folded Spill
; RV32ID-NEXT: fsd fs10, 32(sp) # 8-byte Folded Spill
; RV32ID-NEXT: fsd fs11, 24(sp) # 8-byte Folded Spill
-; RV32ID-NEXT: sw a1, 20(sp)
; RV32ID-NEXT: sw a0, 16(sp)
+; RV32ID-NEXT: sw a1, 20(sp)
; RV32ID-NEXT: fld fa5, 16(sp)
; RV32ID-NEXT: fsd fa5, 8(sp) # 8-byte Folded Spill
; RV32ID-NEXT: #APP
@@ -804,13 +804,15 @@ define double @fill_i64_to_double(i64 %a) nounwind {
; RV32ID-NEXT: fsd fs9, 40(sp) # 8-byte Folded Spill
; RV32ID-NEXT: fsd fs10, 32(sp) # 8-byte Folded Spill
; RV32ID-NEXT: fsd fs11, 24(sp) # 8-byte Folded Spill
-; RV32ID-NEXT: sw a1, 20(sp)
-; RV32ID-NEXT: sw a0, 16(sp)
-; RV32ID-NEXT: fld fa5, 16(sp)
-; RV32ID-NEXT: fsd fa5, 8(sp) # 8-byte Folded Spill
+; RV32ID-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32ID-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
; RV32ID-NEXT: #APP
; RV32ID-NEXT: #NO_APP
-; RV32ID-NEXT: fld fa0, 8(sp) # 8-byte Folded Reload
+; RV32ID-NEXT: lw a0, 8(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: sw a0, 16(sp)
+; RV32ID-NEXT: lw a0, 12(sp) # 4-byte Folded Reload
+; RV32ID-NEXT: sw a0, 20(sp)
+; RV32ID-NEXT: fld fa0, 16(sp)
; RV32ID-NEXT: lw ra, 172(sp) # 4-byte Folded Reload
; RV32ID-NEXT: lw s0, 168(sp) # 4-byte Folded Reload
; RV32ID-NEXT: lw s1, 164(sp) # 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir b/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir
new file mode 100644
index 000000000000..a2a722a7f728
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/zdinx-large-spill.mir
@@ -0,0 +1,74 @@
+# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+# RUN: llc %s -mtriple=riscv32 -mattr=+zdinx -start-before=prologepilog -o - | FileCheck %s
+
+# We want to make sure eliminateFrameIndex doesn't fold sp+2044 as an offset in
+# a GPR pair spill/reload instruction. When we split the pair spill, we would be
+# unable to add 4 to the immediate without overflowing simm12.
+
+--- |
+ define void @foo() {
+ ; CHECK-LABEL: foo:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: addi sp, sp, -2048
+ ; CHECK-NEXT: addi sp, sp, -16
+ ; CHECK-NEXT: .cfi_def_cfa_offset 2064
+ ; CHECK-NEXT: lui t0, 1
+ ; CHECK-NEXT: add t0, sp, t0
+ ; CHECK-NEXT: sw a0, -2040(t0)
+ ; CHECK-NEXT: sw a1, -2036(t0)
+ ; CHECK-NEXT: lui a0, 1
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: sw a2, -2048(a0)
+ ; CHECK-NEXT: sw a3, -2044(a0)
+ ; CHECK-NEXT: sw a4, 2040(sp)
+ ; CHECK-NEXT: sw a5, 2044(sp)
+ ; CHECK-NEXT: sw a6, 2032(sp)
+ ; CHECK-NEXT: sw a7, 2036(sp)
+ ; CHECK-NEXT: lui a0, 1
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: lw a1, -2036(a0)
+ ; CHECK-NEXT: lw a0, -2040(a0)
+ ; CHECK-NEXT: lui a0, 1
+ ; CHECK-NEXT: add a0, sp, a0
+ ; CHECK-NEXT: lw a2, -2048(a0)
+ ; CHECK-NEXT: lw a3, -2044(a0)
+ ; CHECK-NEXT: lw a4, 2040(sp)
+ ; CHECK-NEXT: lw a5, 2044(sp)
+ ; CHECK-NEXT: lw a6, 2032(sp)
+ ; CHECK-NEXT: lw a7, 2036(sp)
+ ; CHECK-NEXT: addi sp, sp, 2032
+ ; CHECK-NEXT: addi sp, sp, 32
+ ; CHECK-NEXT: ret
+ ret void
+ }
+...
+---
+name: foo
+tracksRegLiveness: true
+tracksDebugUserValues: true
+frameInfo:
+ maxAlignment: 4
+stack:
+ - { id: 0, type: spill-slot, size: 8, alignment: 4 }
+ - { id: 1, type: spill-slot, size: 8, alignment: 4 }
+ - { id: 2, type: spill-slot, size: 8, alignment: 4 }
+ - { id: 3, type: spill-slot, size: 8, alignment: 4 }
+ - { id: 4, type: spill-slot, size: 2024, alignment: 4 }
+machineFunctionInfo:
+ varArgsFrameIndex: 0
+ varArgsSaveSize: 0
+body: |
+ bb.0:
+ liveins: $x10_x11, $x12_x13, $x14_x15, $x16_x17
+
+ PseudoRV32ZdinxSD killed renamable $x10_x11, %stack.0, 0 :: (store (s64) into %stack.0, align 4)
+ PseudoRV32ZdinxSD killed renamable $x12_x13, %stack.1, 0 :: (store (s64) into %stack.1, align 4)
+ PseudoRV32ZdinxSD killed renamable $x14_x15, %stack.2, 0 :: (store (s64) into %stack.2, align 4)
+ PseudoRV32ZdinxSD killed renamable $x16_x17, %stack.3, 0 :: (store (s64) into %stack.3, align 4)
+ renamable $x10_x11 = PseudoRV32ZdinxLD %stack.0, 0 :: (load (s64) from %stack.0, align 4)
+ renamable $x12_x13 = PseudoRV32ZdinxLD %stack.1, 0 :: (load (s64) from %stack.1, align 4)
+ renamable $x14_x15 = PseudoRV32ZdinxLD %stack.2, 0 :: (load (s64) from %stack.2, align 4)
+ renamable $x16_x17 = PseudoRV32ZdinxLD %stack.3, 0 :: (load (s64) from %stack.3, align 4)
+ PseudoRET
+
+...
diff --git a/llvm/test/CodeGen/SystemZ/frame-adjstack.ll b/llvm/test/CodeGen/SystemZ/frame-adjstack.ll
new file mode 100644
index 000000000000..7edacaa3d7d7
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/frame-adjstack.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s
+;
+; Test that inserting a new MBB near a call during finalize isel custom
+; insertion does not cause all frame instructions to be missed. That would
+; result in a failure to set the AdjustsStack flag.
+
+; CHECK-LABEL: fun
+define void @fun(i1 %cc) {
+ %sel = select i1 %cc, i32 5, i32 0
+ tail call void @input_report_abs(i32 %sel)
+ %sel2 = select i1 %cc, i32 6, i32 1
+ tail call void @input_report_abs(i32 %sel2)
+ ret void
+}
+
+declare void @input_report_abs(i32)
diff --git a/llvm/test/CodeGen/X86/apx/domain-reassignment.mir b/llvm/test/CodeGen/X86/apx/domain-reassignment.mir
new file mode 100644
index 000000000000..dcd435619990
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/domain-reassignment.mir
@@ -0,0 +1,929 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -run-pass x86-domain-reassignment -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq -o - %s | FileCheck %s
+--- |
+ ; ModuleID = '../test/CodeGen/X86/gpr-to-mask.ll'
+ source_filename = "../test/CodeGen/X86/gpr-to-mask.ll"
+ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+ target triple = "x86_64-unknown-unknown"
+
+ define void @test_fcmp_storefloat(i1 %cond, ptr %fptr, float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) #0 {
+ entry:
+ br i1 %cond, label %if, label %else
+
+ if: ; preds = %entry
+ %cmp1 = fcmp oeq float %f3, %f4
+ br label %exit
+
+ else: ; preds = %entry
+ %cmp2 = fcmp oeq float %f5, %f6
+ br label %exit
+
+ exit: ; preds = %else, %if
+ %val = phi i1 [ %cmp1, %if ], [ %cmp2, %else ]
+ %selected = select i1 %val, float %f1, float %f2
+ store float %selected, ptr %fptr
+ ret void
+ }
+
+ define void @test_8bitops() #0 {
+ ret void
+ }
+ define void @test_16bitops() #0 {
+ ret void
+ }
+ define void @test_32bitops() #0 {
+ ret void
+ }
+ define void @test_64bitops() #0 {
+ ret void
+ }
+ define void @test_16bitext() #0 {
+ ret void
+ }
+ define void @test_32bitext() #0 {
+ ret void
+ }
+ define void @test_64bitext() #0 {
+ ret void
+ }
+  ; Note that this function needs to be compiled with -global-isel
+ ; to obtain testable MIR
+ define void @test_unused(i64 %0) #0 {
+ %unused = lshr i64 %0, 7
+ ret void
+ }
+...
+---
+name: test_fcmp_storefloat
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr8, preferred-register: '' }
+ - { id: 1, class: gr8, preferred-register: '' }
+ - { id: 2, class: gr8, preferred-register: '' }
+ - { id: 3, class: gr32, preferred-register: '' }
+ - { id: 4, class: gr64, preferred-register: '' }
+ - { id: 5, class: vr128x, preferred-register: '' }
+ - { id: 6, class: fr32x, preferred-register: '' }
+ - { id: 7, class: fr32x, preferred-register: '' }
+ - { id: 8, class: fr32x, preferred-register: '' }
+ - { id: 9, class: fr32x, preferred-register: '' }
+ - { id: 10, class: fr32x, preferred-register: '' }
+ - { id: 11, class: gr8, preferred-register: '' }
+ - { id: 12, class: vk1, preferred-register: '' }
+ - { id: 13, class: gr32, preferred-register: '' }
+ - { id: 14, class: vk1, preferred-register: '' }
+ - { id: 15, class: gr32, preferred-register: '' }
+ - { id: 16, class: gr32, preferred-register: '' }
+ - { id: 17, class: gr32, preferred-register: '' }
+ - { id: 18, class: vk1wm, preferred-register: '' }
+ - { id: 19, class: vr128x, preferred-register: '' }
+ - { id: 20, class: vr128, preferred-register: '' }
+ - { id: 21, class: vr128, preferred-register: '' }
+ - { id: 22, class: fr32x, preferred-register: '' }
+liveins:
+ - { reg: '$edi', virtual-reg: '%3' }
+ - { reg: '$rsi', virtual-reg: '%4' }
+ - { reg: '$xmm0', virtual-reg: '%5' }
+ - { reg: '$xmm1', virtual-reg: '%6' }
+ - { reg: '$xmm2', virtual-reg: '%7' }
+ - { reg: '$xmm3', virtual-reg: '%8' }
+ - { reg: '$xmm4', virtual-reg: '%9' }
+ - { reg: '$xmm5', virtual-reg: '%10' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ ; CHECK-LABEL: name: test_fcmp_storefloat
+ ; CHECK: bb.0.entry:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: liveins: $edi, $rsi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5
+ ; CHECK: [[COPY:%[0-9]+]]:fr32x = COPY $xmm5
+ ; CHECK: [[COPY1:%[0-9]+]]:fr32x = COPY $xmm4
+ ; CHECK: [[COPY2:%[0-9]+]]:fr32x = COPY $xmm3
+ ; CHECK: [[COPY3:%[0-9]+]]:fr32x = COPY $xmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:fr32x = COPY $xmm1
+ ; CHECK: [[COPY5:%[0-9]+]]:vr128x = COPY $xmm0
+ ; CHECK: [[COPY6:%[0-9]+]]:gr64 = COPY $rsi
+ ; CHECK: [[COPY7:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK: [[COPY8:%[0-9]+]]:gr8 = COPY [[COPY7]].sub_8bit
+ ; CHECK: TEST8ri killed [[COPY8]], 1, implicit-def $eflags
+ ; CHECK: JCC_1 %bb.2, 4, implicit $eflags
+ ; CHECK: JMP_1 %bb.1
+ ; CHECK: bb.1.if:
+ ; CHECK: successors: %bb.3(0x80000000)
+ ; CHECK: [[VCMPSSZrri:%[0-9]+]]:vk1 = VCMPSSZrri [[COPY3]], [[COPY2]], 0
+ ; CHECK: [[COPY9:%[0-9]+]]:vk32 = COPY [[VCMPSSZrri]]
+ ; CHECK: [[COPY10:%[0-9]+]]:vk8 = COPY [[COPY9]]
+ ; CHECK: JMP_1 %bb.3
+ ; CHECK: bb.2.else:
+ ; CHECK: successors: %bb.3(0x80000000)
+ ; CHECK: [[VCMPSSZrri1:%[0-9]+]]:vk1 = VCMPSSZrri [[COPY1]], [[COPY]], 0
+ ; CHECK: [[COPY11:%[0-9]+]]:vk32 = COPY [[VCMPSSZrri1]]
+ ; CHECK: [[COPY12:%[0-9]+]]:vk8 = COPY [[COPY11]]
+ ; CHECK: bb.3.exit:
+ ; CHECK: [[PHI:%[0-9]+]]:vk8 = PHI [[COPY12]], %bb.2, [[COPY10]], %bb.1
+ ; CHECK: [[DEF:%[0-9]+]]:vk32 = IMPLICIT_DEF
+ ; CHECK: [[COPY13:%[0-9]+]]:vk32 = COPY [[PHI]]
+ ; CHECK: [[COPY14:%[0-9]+]]:vk1wm = COPY [[COPY13]]
+ ; CHECK: [[COPY15:%[0-9]+]]:vr128x = COPY [[COPY4]]
+ ; CHECK: [[DEF1:%[0-9]+]]:vr128 = IMPLICIT_DEF
+ ; CHECK: [[VMOVSSZrrk:%[0-9]+]]:vr128 = VMOVSSZrrk [[COPY15]], killed [[COPY14]], killed [[DEF1]], [[COPY5]]
+ ; CHECK: [[COPY16:%[0-9]+]]:fr32x = COPY [[VMOVSSZrrk]]
+ ; CHECK: VMOVSSZmr [[COPY6]], 1, $noreg, 0, $noreg, killed [[COPY16]] :: (store (s32) into %ir.fptr)
+ ; CHECK: RET 0
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $edi, $rsi, $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5
+
+ %10 = COPY $xmm5
+ %9 = COPY $xmm4
+ %8 = COPY $xmm3
+ %7 = COPY $xmm2
+ %6 = COPY $xmm1
+ %5 = COPY $xmm0
+ %4 = COPY $rsi
+ %3 = COPY $edi
+ %11 = COPY %3.sub_8bit
+ TEST8ri killed %11, 1, implicit-def $eflags
+ JCC_1 %bb.2, 4, implicit $eflags
+ JMP_1 %bb.1
+
+ bb.1.if:
+ successors: %bb.3(0x80000000)
+
+ %14 = VCMPSSZrri %7, %8, 0, implicit $mxcsr
+
+ ; check that cross domain copies are replaced with same domain copies.
+
+ %15 = COPY %14
+ %0 = COPY %15.sub_8bit
+ JMP_1 %bb.3
+
+ bb.2.else:
+ successors: %bb.3(0x80000000)
+ %12 = VCMPSSZrri %9, %10, 0, implicit $mxcsr
+
+ ; check that cross domain copies are replaced with same domain copies.
+
+ %13 = COPY %12
+ %1 = COPY %13.sub_8bit
+
+ bb.3.exit:
+
+ ; check PHI, IMPLICIT_DEF, and INSERT_SUBREG replacers.
+
+ %2 = PHI %1, %bb.2, %0, %bb.1
+ %17 = IMPLICIT_DEF
+ %16 = INSERT_SUBREG %17, %2, %subreg.sub_8bit_hi
+ %18 = COPY %16
+ %19 = COPY %6
+ %21 = IMPLICIT_DEF
+ %20 = VMOVSSZrrk %19, killed %18, killed %21, %5
+ %22 = COPY %20
+ VMOVSSZmr %4, 1, $noreg, 0, $noreg, killed %22 :: (store (s32) into %ir.fptr)
+ RET 0
+
+...
+---
+name: test_8bitops
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vr512, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: vk8, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr8, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: vk8wm, preferred-register: '' }
+ - { id: 11, class: vr512, preferred-register: '' }
+ - { id: 12, class: gr8, preferred-register: '' }
+ - { id: 13, class: gr8, preferred-register: '' }
+ - { id: 14, class: gr8, preferred-register: '' }
+ - { id: 15, class: gr8, preferred-register: '' }
+ - { id: 16, class: gr8, preferred-register: '' }
+ - { id: 17, class: gr8, preferred-register: '' }
+ - { id: 18, class: gr8, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+ - { reg: '$zmm2', virtual-reg: '%3' }
+ - { reg: '$zmm3', virtual-reg: '%4' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ ; CHECK-LABEL: name: test_8bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY $zmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY $zmm3
+ ; CHECK: [[VCMPPDZrri:%[0-9]+]]:vk8 = VCMPPDZrri [[COPY3]], [[COPY4]], 0
+ ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPDZrri]]
+ ; CHECK: [[COPY6:%[0-9]+]]:vk8 = COPY [[COPY5]]
+ ; CHECK: [[KSHIFTRBri:%[0-9]+]]:vk8 = KSHIFTRBri [[COPY6]], 2
+ ; CHECK: [[KSHIFTLBri:%[0-9]+]]:vk8 = KSHIFTLBri [[KSHIFTRBri]], 1
+ ; CHECK: [[KNOTBrr:%[0-9]+]]:vk8 = KNOTBrr [[KSHIFTLBri]]
+ ; CHECK: [[KORBrr:%[0-9]+]]:vk8 = KORBrr [[KNOTBrr]], [[KSHIFTRBri]]
+ ; CHECK: [[KANDBrr:%[0-9]+]]:vk8 = KANDBrr [[KORBrr]], [[KSHIFTLBri]]
+ ; CHECK: [[KXORBrr:%[0-9]+]]:vk8 = KXORBrr [[KANDBrr]], [[KSHIFTRBri]]
+ ; CHECK: [[KADDBrr:%[0-9]+]]:vk8 = KADDBrr [[KXORBrr]], [[KNOTBrr]]
+ ; CHECK: [[DEF:%[0-9]+]]:vk32 = IMPLICIT_DEF
+ ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KADDBrr]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vk8wm = COPY [[COPY7]]
+ ; CHECK: [[VMOVAPDZrrk:%[0-9]+]]:vr512 = VMOVAPDZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
+ ; CHECK: VMOVAPDZmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVAPDZrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3
+
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+ %3 = COPY $zmm2
+ %4 = COPY $zmm3
+
+ %5 = VCMPPDZrri %3, %4, 0, implicit $mxcsr
+ %6 = COPY %5
+ %7 = COPY %6.sub_8bit
+
+ %12 = SHR8ri %7, 2, implicit-def dead $eflags
+ %13 = SHL8ri %12, 1, implicit-def dead $eflags
+ %14 = NOT8r %13
+ %15 = OR8rr %14, %12, implicit-def dead $eflags
+ %16 = AND8rr %15, %13, implicit-def dead $eflags
+ %17 = XOR8rr %16, %12, implicit-def dead $eflags
+ %18 = ADD8rr %17, %14, implicit-def dead $eflags
+
+ %8 = IMPLICIT_DEF
+ %9 = INSERT_SUBREG %8, %18, %subreg.sub_8bit_hi
+ %10 = COPY %9
+ %11 = VMOVAPDZrrk %2, killed %10, %1
+ VMOVAPDZmr %0, 1, $noreg, 0, $noreg, killed %11
+
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; TEST8rr %18, %18, implicit-def $eflags
+ ; JCC_1 %bb.1, 4, implicit $eflags
+ ; JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_16bitops
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vr512, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: vk16, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr16, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: vk16wm, preferred-register: '' }
+ - { id: 11, class: vr512, preferred-register: '' }
+ - { id: 12, class: gr16, preferred-register: '' }
+ - { id: 13, class: gr16, preferred-register: '' }
+ - { id: 14, class: gr16, preferred-register: '' }
+ - { id: 15, class: gr16, preferred-register: '' }
+ - { id: 16, class: gr16, preferred-register: '' }
+ - { id: 17, class: gr16, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+ - { reg: '$zmm2', virtual-reg: '%3' }
+ - { reg: '$zmm3', virtual-reg: '%4' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ ; CHECK-LABEL: name: test_16bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[COPY3:%[0-9]+]]:vr512 = COPY $zmm2
+ ; CHECK: [[COPY4:%[0-9]+]]:vr512 = COPY $zmm3
+ ; CHECK: [[VCMPPSZrri:%[0-9]+]]:vk16 = VCMPPSZrri [[COPY3]], [[COPY4]], 0
+ ; CHECK: [[COPY5:%[0-9]+]]:vk32 = COPY [[VCMPPSZrri]]
+ ; CHECK: [[COPY6:%[0-9]+]]:vk16 = COPY [[COPY5]]
+ ; CHECK: [[KSHIFTRWri:%[0-9]+]]:vk16 = KSHIFTRWri [[COPY6]], 2
+ ; CHECK: [[KSHIFTLWri:%[0-9]+]]:vk16 = KSHIFTLWri [[KSHIFTRWri]], 1
+ ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[KSHIFTLWri]]
+ ; CHECK: [[KORWrr:%[0-9]+]]:vk16 = KORWrr [[KNOTWrr]], [[KSHIFTRWri]]
+ ; CHECK: [[KANDWrr:%[0-9]+]]:vk16 = KANDWrr [[KORWrr]], [[KSHIFTLWri]]
+ ; CHECK: [[KXORWrr:%[0-9]+]]:vk16 = KXORWrr [[KANDWrr]], [[KSHIFTRWri]]
+ ; CHECK: [[DEF:%[0-9]+]]:vk32 = IMPLICIT_DEF
+ ; CHECK: [[COPY7:%[0-9]+]]:vk32 = COPY [[KXORWrr]]
+ ; CHECK: [[COPY8:%[0-9]+]]:vk16wm = COPY [[COPY7]]
+ ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY8]], [[COPY1]]
+ ; CHECK: VMOVAPSZmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVAPSZrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1, $zmm2, $zmm3
+
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+ %3 = COPY $zmm2
+ %4 = COPY $zmm3
+
+ %5 = VCMPPSZrri %3, %4, 0, implicit $mxcsr
+ %6 = COPY %5
+ %7 = COPY %6.sub_16bit
+
+ %12 = SHR16ri %7, 2, implicit-def dead $eflags
+ %13 = SHL16ri %12, 1, implicit-def dead $eflags
+ %14 = NOT16r %13
+ %15 = OR16rr %14, %12, implicit-def dead $eflags
+ %16 = AND16rr %15, %13, implicit-def dead $eflags
+ %17 = XOR16rr %16, %12, implicit-def dead $eflags
+
+ %8 = IMPLICIT_DEF
+ %9 = INSERT_SUBREG %8, %17, %subreg.sub_16bit
+ %10 = COPY %9
+ %11 = VMOVAPSZrrk %2, killed %10, %1
+ VMOVAPSZmr %0, 1, $noreg, 0, $noreg, killed %11
+
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; FIXME TEST16rr %17, %17, implicit-def $eflags
+ ; FIXME JCC_1 %bb.1, 4, implicit $eflags
+ ; FIXME JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_32bitops
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk32wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+ - { id: 8, class: gr32, preferred-register: '' }
+ - { id: 9, class: gr32, preferred-register: '' }
+ - { id: 10, class: gr32, preferred-register: '' }
+ - { id: 11, class: gr32, preferred-register: '' }
+ - { id: 12, class: gr32, preferred-register: '' }
+ - { id: 13, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ ; CHECK-LABEL: name: test_32bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[KMOVDkm:%[0-9]+]]:vk32 = KMOVDkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[KSHIFTRDri:%[0-9]+]]:vk32 = KSHIFTRDri [[KMOVDkm]], 2
+ ; CHECK: [[KSHIFTLDri:%[0-9]+]]:vk32 = KSHIFTLDri [[KSHIFTRDri]], 1
+ ; CHECK: [[KNOTDrr:%[0-9]+]]:vk32 = KNOTDrr [[KSHIFTLDri]]
+ ; CHECK: [[KORDrr:%[0-9]+]]:vk32 = KORDrr [[KNOTDrr]], [[KSHIFTRDri]]
+ ; CHECK: [[KANDDrr:%[0-9]+]]:vk32 = KANDDrr [[KORDrr]], [[KSHIFTLDri]]
+ ; CHECK: [[KXORDrr:%[0-9]+]]:vk32 = KXORDrr [[KANDDrr]], [[KSHIFTRDri]]
+ ; CHECK: [[KANDNDrr:%[0-9]+]]:vk32 = KANDNDrr [[KXORDrr]], [[KORDrr]]
+ ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[KANDNDrr]], [[KXORDrr]]
+ ; CHECK: [[COPY3:%[0-9]+]]:vk32wm = COPY [[KADDDrr]]
+ ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU16Zrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1
+
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+
+ %5 = MOV32rm %0, 1, $noreg, 0, $noreg
+ %6 = SHR32ri %5, 2, implicit-def dead $eflags
+ %7 = SHL32ri %6, 1, implicit-def dead $eflags
+ %8 = NOT32r %7
+ %9 = OR32rr %8, %6, implicit-def dead $eflags
+ %10 = AND32rr %9, %7, implicit-def dead $eflags
+ %11 = XOR32rr %10, %6, implicit-def dead $eflags
+ %12 = ANDN32rr %11, %9, implicit-def dead $eflags
+ %13 = ADD32rr %12, %11, implicit-def dead $eflags
+
+ %3 = COPY %13
+ %4 = VMOVDQU16Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4
+
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; FIXME TEST32rr %13, %13, implicit-def $eflags
+ ; FIXME JCC_1 %bb.1, 4, implicit $eflags
+ ; FIXME JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_64bitops
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64, preferred-register: '' }
+ - { id: 8, class: gr64, preferred-register: '' }
+ - { id: 9, class: gr64, preferred-register: '' }
+ - { id: 10, class: gr64, preferred-register: '' }
+ - { id: 11, class: gr64, preferred-register: '' }
+ - { id: 12, class: gr64, preferred-register: '' }
+ - { id: 13, class: gr64, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ ; CHECK-LABEL: name: test_64bitops
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x80000000)
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[KMOVQkm:%[0-9]+]]:vk64 = KMOVQkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[KSHIFTRQri:%[0-9]+]]:vk64 = KSHIFTRQri [[KMOVQkm]], 2
+ ; CHECK: [[KSHIFTLQri:%[0-9]+]]:vk64 = KSHIFTLQri [[KSHIFTRQri]], 1
+ ; CHECK: [[KNOTQrr:%[0-9]+]]:vk64 = KNOTQrr [[KSHIFTLQri]]
+ ; CHECK: [[KORQrr:%[0-9]+]]:vk64 = KORQrr [[KNOTQrr]], [[KSHIFTRQri]]
+ ; CHECK: [[KANDQrr:%[0-9]+]]:vk64 = KANDQrr [[KORQrr]], [[KSHIFTLQri]]
+ ; CHECK: [[KXORQrr:%[0-9]+]]:vk64 = KXORQrr [[KANDQrr]], [[KSHIFTRQri]]
+ ; CHECK: [[KANDNQrr:%[0-9]+]]:vk64 = KANDNQrr [[KXORQrr]], [[KORQrr]]
+ ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[KANDNQrr]], [[KXORQrr]]
+ ; CHECK: [[COPY3:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
+ ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY3]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU8Zrrk]]
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: bb.2:
+ ; CHECK: RET 0
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1
+
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+
+ %5 = MOV64rm %0, 1, $noreg, 0, $noreg
+ %6 = SHR64ri %5, 2, implicit-def dead $eflags
+ %7 = SHL64ri %6, 1, implicit-def dead $eflags
+ %8 = NOT64r %7
+ %9 = OR64rr %8, %6, implicit-def dead $eflags
+ %10 = AND64rr %9, %7, implicit-def dead $eflags
+ %11 = XOR64rr %10, %6, implicit-def dead $eflags
+ %12 = ANDN64rr %11, %9, implicit-def dead $eflags
+ %13 = ADD64rr %12, %11, implicit-def dead $eflags
+
+ %3 = COPY %13
+ %4 = VMOVDQU8Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4
+
+ ; FIXME We can't replace TEST with KTEST due to flag differences
+ ; FIXME TEST64rr %13, %13, implicit-def $eflags
+ ; FIXME JCC_1 %bb.1, 4, implicit $eflags
+ ; FIXME JMP_1 %bb.2
+
+ bb.1:
+
+ bb.2:
+ RET 0
+
+...
+---
+name: test_16bitext
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk16wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr16, preferred-register: '' }
+ - { id: 6, class: gr16, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1
+
+ ; CHECK-LABEL: name: test_16bitext
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk16 = COPY [[KMOVBkm]]
+ ; CHECK: [[KNOTWrr:%[0-9]+]]:vk16 = KNOTWrr [[COPY3]]
+ ; CHECK: [[COPY4:%[0-9]+]]:vk16wm = COPY [[KNOTWrr]]
+ ; CHECK: [[VMOVAPSZrrk:%[0-9]+]]:vr512 = VMOVAPSZrrk [[COPY2]], killed [[COPY4]], [[COPY1]]
+ ; CHECK: VMOVAPSZmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVAPSZrrk]]
+ ; CHECK: RET 0
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+
+ %5 = MOVZX16rm8 %0, 1, $noreg, 0, $noreg
+ %6 = NOT16r %5
+
+ %3 = COPY %6
+ %4 = VMOVAPSZrrk %2, killed %3, %1
+ VMOVAPSZmr %0, 1, $noreg, 0, $noreg, killed %4
+ RET 0
+
+...
+---
+name: test_32bitext
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr32, preferred-register: '' }
+ - { id: 6, class: gr32, preferred-register: '' }
+ - { id: 7, class: gr32, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1
+
+ ; CHECK-LABEL: name: test_32bitext
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk32 = COPY [[KMOVBkm]]
+ ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[COPY4:%[0-9]+]]:vk32 = COPY [[KMOVWkm]]
+ ; CHECK: [[KADDDrr:%[0-9]+]]:vk32 = KADDDrr [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDDrr]]
+ ; CHECK: [[VMOVDQU16Zrrk:%[0-9]+]]:vr512 = VMOVDQU16Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU16Zrrk]]
+ ; CHECK: RET 0
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+
+ %5 = MOVZX32rm8 %0, 1, $noreg, 0, $noreg
+ %6 = MOVZX32rm16 %0, 1, $noreg, 0, $noreg
+ %7 = ADD32rr %5, %6, implicit-def dead $eflags
+
+ %3 = COPY %7
+ %4 = VMOVDQU16Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4
+ RET 0
+
+...
+---
+name: test_64bitext
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gr64, preferred-register: '' }
+ - { id: 1, class: vr512, preferred-register: '' }
+ - { id: 2, class: vr512, preferred-register: '' }
+ - { id: 3, class: vk64wm, preferred-register: '' }
+ - { id: 4, class: vr512, preferred-register: '' }
+ - { id: 5, class: gr64, preferred-register: '' }
+ - { id: 6, class: gr64, preferred-register: '' }
+ - { id: 7, class: gr64, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '%0' }
+ - { reg: '$zmm0', virtual-reg: '%1' }
+ - { reg: '$zmm1', virtual-reg: '%2' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ maxCallFrameSize: 4294967295
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+stack:
+constants:
+body: |
+ bb.0:
+ liveins: $rdi, $zmm0, $zmm1
+
+ ; CHECK-LABEL: name: test_64bitext
+ ; CHECK: liveins: $rdi, $zmm0, $zmm1
+ ; CHECK: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
+ ; CHECK: [[COPY1:%[0-9]+]]:vr512 = COPY $zmm0
+ ; CHECK: [[COPY2:%[0-9]+]]:vr512 = COPY $zmm1
+ ; CHECK: [[KMOVBkm:%[0-9]+]]:vk8 = KMOVBkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[COPY3:%[0-9]+]]:vk64 = COPY [[KMOVBkm]]
+ ; CHECK: [[KMOVWkm:%[0-9]+]]:vk16 = KMOVWkm [[COPY]], 1, $noreg, 0, $noreg
+ ; CHECK: [[COPY4:%[0-9]+]]:vk64 = COPY [[KMOVWkm]]
+ ; CHECK: [[KADDQrr:%[0-9]+]]:vk64 = KADDQrr [[COPY3]], [[COPY4]]
+ ; CHECK: [[COPY5:%[0-9]+]]:vk64wm = COPY [[KADDQrr]]
+ ; CHECK: [[VMOVDQU8Zrrk:%[0-9]+]]:vr512 = VMOVDQU8Zrrk [[COPY2]], killed [[COPY5]], [[COPY1]]
+ ; CHECK: VMOVDQA32Zmr [[COPY]], 1, $noreg, 0, $noreg, killed [[VMOVDQU8Zrrk]]
+ ; CHECK: RET 0
+ %0 = COPY $rdi
+ %1 = COPY $zmm0
+ %2 = COPY $zmm1
+
+ %5 = MOVZX64rm8 %0, 1, $noreg, 0, $noreg
+ %6 = MOVZX64rm16 %0, 1, $noreg, 0, $noreg
+ %7 = ADD64rr %5, %6, implicit-def dead $eflags
+
+ %3 = COPY %7
+ %4 = VMOVDQU8Zrrk %2, killed %3, %1
+ VMOVDQA32Zmr %0, 1, $noreg, 0, $noreg, killed %4
+ RET 0
+
+...
+---
+name: test_unused
+alignment: 16
+exposesReturnsTwice: false
+legalized: true
+regBankSelected: true
+selected: true
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+# Note that this test is supposed to have registers without classes
+ - { id: 0, class: _, preferred-register: '' }
+ - { id: 1, class: _, preferred-register: '' }
+ - { id: 2, class: _, preferred-register: '' }
+liveins:
+ - { reg: '$rdi', virtual-reg: '' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.1 (%ir-block.1):
+ liveins: $rdi
+
+ RET 0
+
+...
diff --git a/llvm/test/CodeGen/X86/dagcombine-shifts.ll b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
index 42b325dd4c22..734abfe55a4e 100644
--- a/llvm/test/CodeGen/X86/dagcombine-shifts.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-shifts.ll
@@ -322,5 +322,132 @@ define void @g(i32 %a) nounwind {
ret void
}
+define i32 @shift_zext_shl(i8 zeroext %x) {
+; X86-LABEL: shift_zext_shl:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $64, %eax
+; X86-NEXT: shll $9, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shift_zext_shl:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $64, %eax
+; X64-NEXT: shll $9, %eax
+; X64-NEXT: retq
+ %a = and i8 %x, 64
+ %b = zext i8 %a to i16
+ %c = shl i16 %b, 9
+ %d = zext i16 %c to i32
+ ret i32 %d
+}
+
+define i32 @shift_zext_shl2(i8 zeroext %x) {
+; X86-LABEL: shift_zext_shl2:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl $64, %eax
+; X86-NEXT: shll $9, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: shift_zext_shl2:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: andl $64, %eax
+; X64-NEXT: shll $9, %eax
+; X64-NEXT: retq
+ %a = and i8 %x, 64
+ %b = zext i8 %a to i32
+ %c = shl i32 %b, 9
+ ret i32 %c
+}
+
+define <4 x i32> @shift_zext_shl_vec(<4 x i8> %x) nounwind {
+; X86-LABEL: shift_zext_shl_vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $64, %ecx
+; X86-NEXT: shll $9, %ecx
+; X86-NEXT: andl $63, %edx
+; X86-NEXT: shll $8, %edx
+; X86-NEXT: andl $31, %esi
+; X86-NEXT: shll $7, %esi
+; X86-NEXT: andl $23, %edi
+; X86-NEXT: shll $6, %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+;
+; X64-LABEL: shift_zext_shl_vec:
+; X64: # %bb.0:
+; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: retq
+ %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
+ %b = zext <4 x i8> %a to <4 x i16>
+ %c = shl <4 x i16> %b, <i16 9, i16 8, i16 7, i16 6>
+ %d = zext <4 x i16> %c to <4 x i32>
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @shift_zext_shl2_vec(<4 x i8> %x) nounwind {
+; X86-LABEL: shift_zext_shl2_vec:
+; X86: # %bb.0:
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: andl $23, %edi
+; X86-NEXT: andl $31, %esi
+; X86-NEXT: andl $63, %edx
+; X86-NEXT: andl $64, %ecx
+; X86-NEXT: shll $9, %ecx
+; X86-NEXT: shll $8, %edx
+; X86-NEXT: shll $7, %esi
+; X86-NEXT: shll $6, %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
+;
+; X64-LABEL: shift_zext_shl2_vec:
+; X64: # %bb.0:
+; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT: retq
+ %a = and <4 x i8> %x, <i8 64, i8 63, i8 31, i8 23>
+ %b = zext <4 x i8> %a to <4 x i32>
+ %c = shl <4 x i32> %b, <i32 9, i32 8, i32 7, i32 6>
+ ret <4 x i32> %c
+}
+
declare dso_local void @f(i64)
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 5da18ee6ad7c..01056a8b2c24 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2369,6 +2369,31 @@ define void @PR41097() {
ret void
}
+; FIXME - should use INSERTPS
+define <2 x float> @PR86068(<2 x float> %0, <2 x float> %1) {
+; SSE2-LABEL: PR86068:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: PR86068:
+; SSE42: # %bb.0: # %entry
+; SSE42-NEXT: movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE42-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; SSE42-NEXT: retq
+;
+; AVX-LABEL: PR86068:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: retq
+entry:
+ %3 = shufflevector <2 x float> %1, <2 x float> poison, <2 x i32> <i32 1, i32 poison>
+ %4 = shufflevector <2 x float> %3, <2 x float> %0, <2 x i32> <i32 0, i32 3>
+ ret <2 x float> %4
+}
+
define void @D107009(ptr %input, ptr %output) {
; SSE-LABEL: D107009:
; SSE: # %bb.0:
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s b/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
deleted file mode 100644
index fdfbf65c0e3c..000000000000
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vinterp.s
+++ /dev/null
@@ -1,278 +0,0 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -show-encoding %s | FileCheck -check-prefix=GCN %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck -check-prefix=GCN %s
-
-v_interp_p10_f32 v0, v1, v2, v3
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f32 v1, v10, v20, v30
-// GCN: v_interp_p10_f32 v1, v10, v20, v30 wait_exp:0 ; encoding: [0x01,0x00,0x00,0xcd,0x0a,0x29,0x7a,0x04]
-
-v_interp_p10_f32 v2, v11, v21, v31
-// GCN: v_interp_p10_f32 v2, v11, v21, v31 wait_exp:0 ; encoding: [0x02,0x00,0x00,0xcd,0x0b,0x2b,0x7e,0x04]
-
-v_interp_p10_f32 v3, v12, v22, v32
-// GCN: v_interp_p10_f32 v3, v12, v22, v32 wait_exp:0 ; encoding: [0x03,0x00,0x00,0xcd,0x0c,0x2d,0x82,0x04]
-
-v_interp_p10_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f32 v0, -v1, v2, v3
-// GCN: v_interp_p10_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p10_f32 v0, v1, -v2, v3
-// GCN: v_interp_p10_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p10_f32 v0, v1, v2, -v3
-// GCN: v_interp_p10_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7
-// GCN: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7 ; encoding: [0x00,0x87,0x00,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v0, v1, v2, v3
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v1, v10, v20, v30
-// GCN: v_interp_p2_f32 v1, v10, v20, v30 wait_exp:0 ; encoding: [0x01,0x00,0x01,0xcd,0x0a,0x29,0x7a,0x04]
-
-v_interp_p2_f32 v2, v11, v21, v31
-// GCN: v_interp_p2_f32 v2, v11, v21, v31 wait_exp:0 ; encoding: [0x02,0x00,0x01,0xcd,0x0b,0x2b,0x7e,0x04]
-
-v_interp_p2_f32 v3, v12, v22, v32
-// GCN: v_interp_p2_f32 v3, v12, v22, v32 wait_exp:0 ; encoding: [0x03,0x00,0x01,0xcd,0x0c,0x2d,0x82,0x04]
-
-v_interp_p2_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v0, -v1, v2, v3
-// GCN: v_interp_p2_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p2_f32 v0, v1, -v2, v3
-// GCN: v_interp_p2_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p2_f32 v0, v1, v2, -v3
-// GCN: v_interp_p2_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7
-// GCN: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7 ; encoding: [0x00,0x87,0x01,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, -v1, v2, v3
-// GCN: v_interp_p10_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p10_f16_f32 v0, v1, -v2, v3
-// GCN: v_interp_p10_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p10_f16_f32 v0, v1, v2, -v3
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,0]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0 ; encoding: [0x00,0x08,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0 ; encoding: [0x00,0x10,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0 ; encoding: [0x00,0x20,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0 ; encoding: [0x00,0x40,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1]
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0 ; encoding: [0x00,0x78,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0x4d,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0xe4]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, -v1, v2, v3
-// GCN: v_interp_p2_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p2_f16_f32 v0, v1, -v2, v3
-// GCN: v_interp_p2_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p2_f16_f32 v0, v1, v2, -v3
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,0]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0 ; encoding: [0x00,0x08,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0 ; encoding: [0x00,0x10,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0 ; encoding: [0x00,0x20,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0 ; encoding: [0x00,0x40,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1]
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0 ; encoding: [0x00,0x78,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0x4d,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0xe4]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3
-// GCN: v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,0]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0 ; encoding: [0x00,0x08,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0 ; encoding: [0x00,0x10,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0 ; encoding: [0x00,0x20,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0 ; encoding: [0x00,0x40,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1]
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0 ; encoding: [0x00,0x78,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0x4d,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p10_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p10_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0xe4]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3
-// GCN: v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x24]
-
-v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x44]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x84]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,0]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0 ; encoding: [0x00,0x08,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0 ; encoding: [0x00,0x10,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0 ; encoding: [0x00,0x20,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0 ; encoding: [0x00,0x40,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1]
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0 ; encoding: [0x00,0x78,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0x4d,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0x04]
-
-v_interp_p2_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5
-// GCN: v_interp_p2_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5 ; encoding: [0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0xe4]
diff --git a/llvm/test/MC/AMDGPU/vinterp-fake16.s b/llvm/test/MC/AMDGPU/vinterp-fake16.s
new file mode 100644
index 000000000000..33dacdd92c31
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/vinterp-fake16.s
@@ -0,0 +1,182 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -show-encoding %s | FileCheck -check-prefix=GCN %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s | FileCheck -check-prefix=GCN %s
+
+v_interp_p10_f32 v0, v1, v2, v3
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f32 v1, v10, v20, v30
+// GCN: v_interp_p10_f32 v1, v10, v20, v30 wait_exp:0 ; encoding: [0x01,0x00,0x00,0xcd,0x0a,0x29,0x7a,0x04]
+
+v_interp_p10_f32 v2, v11, v21, v31
+// GCN: v_interp_p10_f32 v2, v11, v21, v31 wait_exp:0 ; encoding: [0x02,0x00,0x00,0xcd,0x0b,0x2b,0x7e,0x04]
+
+v_interp_p10_f32 v3, v12, v22, v32
+// GCN: v_interp_p10_f32 v3, v12, v22, v32 wait_exp:0 ; encoding: [0x03,0x00,0x00,0xcd,0x0c,0x2d,0x82,0x04]
+
+v_interp_p10_f32 v0, v1, v2, v3 clamp
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f32 v0, -v1, v2, v3
+// GCN: v_interp_p10_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p10_f32 v0, v1, -v2, v3
+// GCN: v_interp_p10_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p10_f32 v0, v1, v2, -v3
+// GCN: v_interp_p10_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7
+// GCN: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7 ; encoding: [0x00,0x87,0x00,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v0, v1, v2, v3
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v1, v10, v20, v30
+// GCN: v_interp_p2_f32 v1, v10, v20, v30 wait_exp:0 ; encoding: [0x01,0x00,0x01,0xcd,0x0a,0x29,0x7a,0x04]
+
+v_interp_p2_f32 v2, v11, v21, v31
+// GCN: v_interp_p2_f32 v2, v11, v21, v31 wait_exp:0 ; encoding: [0x02,0x00,0x01,0xcd,0x0b,0x2b,0x7e,0x04]
+
+v_interp_p2_f32 v3, v12, v22, v32
+// GCN: v_interp_p2_f32 v3, v12, v22, v32 wait_exp:0 ; encoding: [0x03,0x00,0x01,0xcd,0x0c,0x2d,0x82,0x04]
+
+v_interp_p2_f32 v0, v1, v2, v3 clamp
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v0, -v1, v2, v3
+// GCN: v_interp_p2_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p2_f32 v0, v1, -v2, v3
+// GCN: v_interp_p2_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p2_f32 v0, v1, v2, -v3
+// GCN: v_interp_p2_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7
+// GCN: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7 ; encoding: [0x00,0x87,0x01,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, -v1, v2, v3
+// GFX11: v_interp_p10_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p10_f16_f32 v0, v1, -v2, v3
+// GFX11: v_interp_p10_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p10_f16_f32 v0, v1, v2, -v3
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3 clamp
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, -v1, v2, v3
+// GFX11: v_interp_p2_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p2_f16_f32 v0, v1, -v2, v3
+// GFX11: v_interp_p2_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p2_f16_f32 v0, v1, v2, -v3
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3 clamp
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3
+// GFX11: v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p10_rtz_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3
+// GFX11: v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x24]
+
+v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x44]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x84]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0 ; encoding: [0x00,0x80,0x05,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1 ; encoding: [0x00,0x01,0x05,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7 ; encoding: [0x00,0x07,0x05,0xcd,0x01,0x05,0x0e,0x04]
+
+v_interp_p2_rtz_f16_f32 v0, v1, v2, v3
+// GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04]
diff --git a/llvm/test/MC/COFF/dwarf5lineinfo.s b/llvm/test/MC/COFF/dwarf5lineinfo.s
new file mode 100644
index 000000000000..f0789feb2085
--- /dev/null
+++ b/llvm/test/MC/COFF/dwarf5lineinfo.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -filetype=obj -triple x86_64-pc-windows-gnu %s -o - | llvm-readobj -r - | FileCheck %s
+
+// CHECK: Relocations [
+// CHECK: Section (4) .debug_line {
+// CHECK: 0x22 IMAGE_REL_AMD64_SECREL .debug_line_str (8)
+// CHECK: 0x2C IMAGE_REL_AMD64_SECREL .debug_line_str (8)
+// CHECK: 0x36 IMAGE_REL_AMD64_ADDR64 .text (0)
+// CHECK: }
+
+main:
+ .file 0 "/" "test.c"
+ .loc 0 1 0
+ retq
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vinterp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vinterp.txt
deleted file mode 100644
index b22fd5e289fa..000000000000
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vinterp.txt
+++ /dev/null
@@ -1,251 +0,0 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble %s | FileCheck -strict-whitespace -check-prefix=GFX11 %s
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# Check that unused bits in the encoding are ignored.
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x80,0xcd,0x01,0x05,0x0e,0x1c
-
-# GFX11: v_interp_p10_f32 v1, v10, v20, v30 wait_exp:0{{$}}
-0x01,0x00,0x00,0xcd,0x0a,0x29,0x7a,0x04
-
-# GFX11: v_interp_p10_f32 v2, v11, v21, v31 wait_exp:0{{$}}
-0x02,0x00,0x00,0xcd,0x0b,0x2b,0x7e,0x04
-
-# GFX11: v_interp_p10_f32 v3, v12, v22, v32 wait_exp:0{{$}}
-0x03,0x00,0x00,0xcd,0x0c,0x2d,0x82,0x04
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p10_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
-0x00,0x87,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f32 v1, v10, v20, v30 wait_exp:0{{$}}
-0x01,0x00,0x01,0xcd,0x0a,0x29,0x7a,0x04
-
-# GFX11: v_interp_p2_f32 v2, v11, v21, v31 wait_exp:0{{$}}
-0x02,0x00,0x01,0xcd,0x0b,0x2b,0x7e,0x04
-
-# GFX11: v_interp_p2_f32 v3, v12, v22, v32 wait_exp:0{{$}}
-0x03,0x00,0x01,0xcd,0x0c,0x2d,0x82,0x04
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p2_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
-0x00,0x87,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p10_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX11: v_interp_p2_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0xe4
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
index 0c4427cff63e..1be97b242284 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_features.txt
@@ -12,8 +12,13 @@
# GFX12: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05
-# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
+# GFX12: v_add3_u32_e64_dpp v5, v1, s2, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05]
0x05,0x00,0x55,0xd6,0xe9,0x04,0x0c,0x00,0x01,0x77,0x39,0x05
-# GFX1150: v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
+# GFX12: v_cmp_ne_i32_e64_dpp vcc_lo, v1, s2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05]
0x6a,0x00,0x45,0xd4,0xe9,0x04,0x00,0x00,0x01,0x77,0x39,0x05
+
+# Check that unused bits in the encoding are ignored.
+# This is more strict than the check in vinterp-fake16.txt and is GFX12 specific.
+# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0 ; encoding: [0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04]
+0x00,0x00,0xe0,0xcd,0x01,0x05,0x0e,0x1c
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt
deleted file mode 100644
index 977cd732947c..000000000000
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vinterp.txt
+++ /dev/null
@@ -1,251 +0,0 @@
-# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble %s | FileCheck -strict-whitespace -check-prefix=GFX12 %s
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# Check that unused bits in the encoding are ignored.
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0xe0,0xcd,0x01,0x05,0x0e,0x1c
-
-# GFX12: v_interp_p10_f32 v1, v10, v20, v30 wait_exp:0{{$}}
-0x01,0x00,0x00,0xcd,0x0a,0x29,0x7a,0x04
-
-# GFX12: v_interp_p10_f32 v2, v11, v21, v31 wait_exp:0{{$}}
-0x02,0x00,0x00,0xcd,0x0b,0x2b,0x7e,0x04
-
-# GFX12: v_interp_p10_f32 v3, v12, v22, v32 wait_exp:0{{$}}
-0x03,0x00,0x00,0xcd,0x0c,0x2d,0x82,0x04
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p10_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
-0x00,0x87,0x00,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f32 v1, v10, v20, v30 wait_exp:0{{$}}
-0x01,0x00,0x01,0xcd,0x0a,0x29,0x7a,0x04
-
-# GFX12: v_interp_p2_f32 v2, v11, v21, v31 wait_exp:0{{$}}
-0x02,0x00,0x01,0xcd,0x0b,0x2b,0x7e,0x04
-
-# GFX12: v_interp_p2_f32 v3, v12, v22, v32 wait_exp:0{{$}}
-0x03,0x00,0x01,0xcd,0x0c,0x2d,0x82,0x04
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p2_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
-0x00,0x87,0x01,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p10_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0xe4
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x24
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x44
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
-0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x84
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
-0x00,0x80,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
-0x00,0x01,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
-0x00,0x07,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
-0x00,0x08,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
-0x00,0x10,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
-0x00,0x20,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
-0x00,0x40,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
-0x00,0x78,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0x4d,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0x04
-
-# GFX12: v_interp_p2_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
-0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0xe4
diff --git a/llvm/test/MC/Disassembler/AMDGPU/vinterp-fake16.txt b/llvm/test/MC/Disassembler/AMDGPU/vinterp-fake16.txt
new file mode 100644
index 000000000000..239f1d8b3058
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/vinterp-fake16.txt
@@ -0,0 +1,252 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble %s | FileCheck -strict-whitespace -check-prefix=CHECK %s
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -disassemble %s | FileCheck -strict-whitespace -check-prefix=CHECK %s
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x04
+
+# Check that unused bits in the encoding are ignored.
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x80,0xcd,0x01,0x05,0x0e,0x1c
+
+# CHECK: v_interp_p10_f32 v1, v10, v20, v30 wait_exp:0{{$}}
+0x01,0x00,0x00,0xcd,0x0a,0x29,0x7a,0x04
+
+# CHECK: v_interp_p10_f32 v2, v11, v21, v31 wait_exp:0{{$}}
+0x02,0x00,0x00,0xcd,0x0b,0x2b,0x7e,0x04
+
+# CHECK: v_interp_p10_f32 v3, v12, v22, v32 wait_exp:0{{$}}
+0x03,0x00,0x00,0xcd,0x0c,0x2d,0x82,0x04
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x00,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p10_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x00,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x00,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x00,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
+0x00,0x87,0x00,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f32 v1, v10, v20, v30 wait_exp:0{{$}}
+0x01,0x00,0x01,0xcd,0x0a,0x29,0x7a,0x04
+
+# CHECK: v_interp_p2_f32 v2, v11, v21, v31 wait_exp:0{{$}}
+0x02,0x00,0x01,0xcd,0x0b,0x2b,0x7e,0x04
+
+# CHECK: v_interp_p2_f32 v3, v12, v22, v32 wait_exp:0{{$}}
+0x03,0x00,0x01,0xcd,0x0c,0x2d,0x82,0x04
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x01,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p2_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x01,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x01,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x01,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f32 v0, v1, v2, v3 clamp wait_exp:7{{$}}
+0x00,0x87,0x01,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x02,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
+0x00,0x08,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
+0x00,0x10,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
+0x00,0x20,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
+0x00,0x40,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
+0x00,0x78,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0x4d,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x02,0xcd,0x01,0x05,0x0e,0xe4
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x03,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
+0x00,0x08,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
+0x00,0x10,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
+0x00,0x20,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
+0x00,0x40,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
+0x00,0x78,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0x4d,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x03,0xcd,0x01,0x05,0x0e,0xe4
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x04,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
+0x00,0x08,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
+0x00,0x10,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
+0x00,0x20,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
+0x00,0x40,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
+0x00,0x78,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0x4d,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p10_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x04,0xcd,0x01,0x05,0x0e,0xe4
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, -v1, v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x24
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, -v2, v3 wait_exp:0{{$}}
+0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x44
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, -v3 wait_exp:0{{$}}
+0x00,0x00,0x05,0xcd,0x01,0x05,0x0e,0x84
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp wait_exp:0{{$}}
+0x00,0x80,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:1{{$}}
+0x00,0x01,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 wait_exp:7{{$}}
+0x00,0x07,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,0] wait_exp:0{{$}}
+0x00,0x08,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,1,0,0] wait_exp:0{{$}}
+0x00,0x10,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,1,0] wait_exp:0{{$}}
+0x00,0x20,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[0,0,0,1] wait_exp:0{{$}}
+0x00,0x40,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,1,1,1] wait_exp:0{{$}}
+0x00,0x78,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0x4d,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, v1, v2, v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0x04
+
+# CHECK: v_interp_p2_rtz_f16_f32 v0, -v1, -v2, -v3 clamp op_sel:[1,0,0,1] wait_exp:5{{$}}
+0x00,0xcd,0x05,0xcd,0x01,0x05,0x0e,0xe4
diff --git a/llvm/test/Transforms/Attributor/align.ll b/llvm/test/Transforms/Attributor/align.ll
index 5103b6f1f1e9..9880e53fd43a 100644
--- a/llvm/test/Transforms/Attributor/align.ll
+++ b/llvm/test/Transforms/Attributor/align.ll
@@ -11,10 +11,10 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; TEST 1
;;
;.
-; CHECK: @[[A1:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 8
-; CHECK: @[[A2:[a-zA-Z0-9_$"\\.-]+]] = common global i8 0, align 16
-; CHECK: @[[CND:[a-zA-Z0-9_$"\\.-]+]] = external global i1
-; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = global i8 0, align 32
+; CHECK: @a1 = common global i8 0, align 8
+; CHECK: @a2 = common global i8 0, align 16
+; CHECK: @cnd = external global i1
+; CHECK: @G = global i8 0, align 32
;.
define ptr @test1(ptr align 8 %0) #0 {
; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
@@ -158,18 +158,31 @@ define internal ptr @f1(ptr readnone %0) local_unnamed_addr #0 {
; Function Attrs: nounwind readnone ssp uwtable
define ptr @f2(ptr readnone %0) local_unnamed_addr #0 {
-; CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
-; CHECK-LABEL: define {{[^@]+}}@f2
-; CHECK-SAME: (ptr nofree readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
-; CHECK-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]]
-; CHECK: 3:
-; CHECK-NEXT: br label [[TMP5:%.*]]
-; CHECK: 4:
-; CHECK-NEXT: br label [[TMP5]]
-; CHECK: 5:
-; CHECK-NEXT: [[TMP6:%.*]] = phi ptr [ [[TMP0]], [[TMP3]] ], [ @a1, [[TMP4]] ]
-; CHECK-NEXT: ret ptr [[TMP6]]
+; TUNIT: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
+; TUNIT-LABEL: define {{[^@]+}}@f2
+; TUNIT-SAME: (ptr nofree readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; TUNIT-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; TUNIT-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]]
+; TUNIT: 3:
+; TUNIT-NEXT: br label [[TMP5:%.*]]
+; TUNIT: 4:
+; TUNIT-NEXT: br label [[TMP5]]
+; TUNIT: 5:
+; TUNIT-NEXT: [[TMP6:%.*]] = phi ptr [ [[TMP0]], [[TMP3]] ], [ @a1, [[TMP4]] ]
+; TUNIT-NEXT: ret ptr [[TMP6]]
+;
+; CGSCC: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable
+; CGSCC-LABEL: define {{[^@]+}}@f2
+; CGSCC-SAME: (ptr nofree readnone [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CGSCC-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null
+; CGSCC-NEXT: br i1 [[TMP2]], label [[TMP4:%.*]], label [[TMP3:%.*]]
+; CGSCC: 3:
+; CGSCC-NEXT: br label [[TMP5:%.*]]
+; CGSCC: 4:
+; CGSCC-NEXT: br label [[TMP5]]
+; CGSCC: 5:
+; CGSCC-NEXT: [[TMP6:%.*]] = phi ptr [ [[TMP0]], [[TMP3]] ], [ @a1, [[TMP4]] ]
+; CGSCC-NEXT: ret ptr [[TMP6]]
;
%2 = icmp eq ptr %0, null
br i1 %2, label %5, label %3
@@ -222,7 +235,7 @@ define align 4 ptr @test7() #0 {
; CGSCC: Function Attrs: mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable
; CGSCC-LABEL: define {{[^@]+}}@test7
; CGSCC-SAME: () #[[ATTR1:[0-9]+]] {
-; CGSCC-NEXT: [[C:%.*]] = tail call noundef nonnull align 8 dereferenceable(1) ptr @f1() #[[ATTR14:[0-9]+]]
+; CGSCC-NEXT: [[C:%.*]] = tail call noundef nonnull align 8 dereferenceable(1) ptr @f1() #[[ATTR15:[0-9]+]]
; CGSCC-NEXT: ret ptr [[C]]
;
%c = tail call ptr @f1(ptr align 8 dereferenceable(1) @a1)
@@ -933,7 +946,7 @@ define i32 @musttail_caller_1(ptr %p) {
; TUNIT-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1
; TUNIT-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]]
; TUNIT: mt:
-; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR12:[0-9]+]]
+; TUNIT-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef readonly [[P]]) #[[ATTR13:[0-9]+]]
; TUNIT-NEXT: ret i32 [[V]]
; TUNIT: exit:
; TUNIT-NEXT: ret i32 0
@@ -944,7 +957,7 @@ define i32 @musttail_caller_1(ptr %p) {
; CGSCC-NEXT: [[C:%.*]] = load i1, ptr @cnd, align 1
; CGSCC-NEXT: br i1 [[C]], label [[MT:%.*]], label [[EXIT:%.*]]
; CGSCC: mt:
-; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR15:[0-9]+]]
+; CGSCC-NEXT: [[V:%.*]] = musttail call i32 @musttail_callee_1(ptr nocapture nofree noundef nonnull readonly dereferenceable(4) [[P]]) #[[ATTR16:[0-9]+]]
; CGSCC-NEXT: ret i32 [[V]]
; CGSCC: exit:
; CGSCC-NEXT: ret i32 0
@@ -1076,13 +1089,13 @@ define ptr @aligned_8_return_caller(ptr align(16) %a, i1 %c1, i1 %c2) {
; TUNIT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
; TUNIT-LABEL: define {{[^@]+}}@aligned_8_return_caller
; TUNIT-SAME: (ptr nofree readnone align 16 "no-capture-maybe-returned" [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR10]] {
-; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR13:[0-9]+]]
+; TUNIT-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 "no-capture-maybe-returned" [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR14:[0-9]+]]
; TUNIT-NEXT: ret ptr [[R]]
;
; CGSCC: Function Attrs: mustprogress nofree nosync nounwind willreturn memory(none)
; CGSCC-LABEL: define {{[^@]+}}@aligned_8_return_caller
; CGSCC-SAME: (ptr nofree readnone align 16 [[A:%.*]], i1 noundef [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR13:[0-9]+]] {
-; CGSCC-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR14]]
+; CGSCC-NEXT: [[R:%.*]] = call align 8 ptr @aligned_8_return(ptr noalias nofree readnone align 16 [[A]], i1 noundef [[C1]], i1 [[C2]]) #[[ATTR15]]
; CGSCC-NEXT: ret ptr [[R]]
;
%r = call ptr @aligned_8_return(ptr %a, i1 %c1, i1 %c2)
@@ -1101,6 +1114,104 @@ entry:
ret i32 0
}
+define i64 @infer_align_atomicrmw(ptr align 4 %p) {
+; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; TUNIT-LABEL: define {{[^@]+}}@infer_align_atomicrmw
+; TUNIT-SAME: (ptr nocapture nofree align 16 [[P:%.*]]) #[[ATTR12:[0-9]+]] {
+; TUNIT-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; TUNIT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; TUNIT-NEXT: [[RET:%.*]] = atomicrmw add ptr [[ARRAYIDX1]], i64 4 seq_cst, align 16
+; TUNIT-NEXT: ret i64 [[RET]]
+;
+; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CGSCC-LABEL: define {{[^@]+}}@infer_align_atomicrmw
+; CGSCC-SAME: (ptr nocapture nofree align 16 [[P:%.*]]) #[[ATTR14:[0-9]+]] {
+; CGSCC-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CGSCC-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; CGSCC-NEXT: [[RET:%.*]] = atomicrmw add ptr [[ARRAYIDX1]], i64 4 seq_cst, align 16
+; CGSCC-NEXT: ret i64 [[RET]]
+;
+ %arrayidx0 = getelementptr i64, ptr %p, i64 1
+ %arrayidx1 = getelementptr i64, ptr %arrayidx0, i64 3
+ %ret = atomicrmw add ptr %arrayidx1, i64 4 seq_cst, align 16
+ ret i64 %ret
+}
+
+define ptr @infer_align_atomicrmw_ptr(ptr align 4 %p, ptr %val) {
+; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; TUNIT-LABEL: define {{[^@]+}}@infer_align_atomicrmw_ptr
+; TUNIT-SAME: (ptr nocapture nofree align 16 [[P:%.*]], ptr nofree [[VAL:%.*]]) #[[ATTR12]] {
+; TUNIT-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; TUNIT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; TUNIT-NEXT: [[RET:%.*]] = atomicrmw xchg ptr [[ARRAYIDX1]], ptr [[VAL]] seq_cst, align 16
+; TUNIT-NEXT: ret ptr [[RET]]
+;
+; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CGSCC-LABEL: define {{[^@]+}}@infer_align_atomicrmw_ptr
+; CGSCC-SAME: (ptr nocapture nofree align 16 [[P:%.*]], ptr nofree [[VAL:%.*]]) #[[ATTR14]] {
+; CGSCC-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CGSCC-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; CGSCC-NEXT: [[RET:%.*]] = atomicrmw xchg ptr [[ARRAYIDX1]], ptr [[VAL]] seq_cst, align 16
+; CGSCC-NEXT: ret ptr [[RET]]
+;
+ %arrayidx0 = getelementptr i64, ptr %p, i64 1
+ %arrayidx1 = getelementptr i64, ptr %arrayidx0, i64 3
+ %ret = atomicrmw xchg ptr %arrayidx1, ptr %val seq_cst, align 16
+ ret ptr %ret
+}
+
+define i64 @infer_align_cmpxchg(ptr align 4 %p) {
+; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; TUNIT-LABEL: define {{[^@]+}}@infer_align_cmpxchg
+; TUNIT-SAME: (ptr nocapture nofree align 16 [[P:%.*]]) #[[ATTR12]] {
+; TUNIT-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; TUNIT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; TUNIT-NEXT: [[CMPX:%.*]] = cmpxchg ptr [[ARRAYIDX1]], i64 4, i64 1 seq_cst seq_cst, align 16
+; TUNIT-NEXT: [[RET:%.*]] = extractvalue { i64, i1 } [[CMPX]], 0
+; TUNIT-NEXT: ret i64 [[RET]]
+;
+; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CGSCC-LABEL: define {{[^@]+}}@infer_align_cmpxchg
+; CGSCC-SAME: (ptr nocapture nofree align 16 [[P:%.*]]) #[[ATTR14]] {
+; CGSCC-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CGSCC-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; CGSCC-NEXT: [[CMPX:%.*]] = cmpxchg ptr [[ARRAYIDX1]], i64 4, i64 1 seq_cst seq_cst, align 16
+; CGSCC-NEXT: [[RET:%.*]] = extractvalue { i64, i1 } [[CMPX]], 0
+; CGSCC-NEXT: ret i64 [[RET]]
+;
+ %arrayidx0 = getelementptr i64, ptr %p, i64 1
+ %arrayidx1 = getelementptr i64, ptr %arrayidx0, i64 3
+ %cmpx = cmpxchg ptr %arrayidx1, i64 4, i64 1 seq_cst seq_cst, align 16
+ %ret = extractvalue { i64, i1 } %cmpx, 0
+ ret i64 %ret
+}
+
+define ptr @infer_align_cmpxchg_ptr(ptr align 4 %p, ptr %cmp0, ptr %cmp1) {
+; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; TUNIT-LABEL: define {{[^@]+}}@infer_align_cmpxchg_ptr
+; TUNIT-SAME: (ptr nocapture nofree align 16 [[P:%.*]], ptr nofree [[CMP0:%.*]], ptr nofree [[CMP1:%.*]]) #[[ATTR12]] {
+; TUNIT-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; TUNIT-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; TUNIT-NEXT: [[CMPX:%.*]] = cmpxchg ptr [[ARRAYIDX1]], ptr [[CMP0]], ptr [[CMP1]] seq_cst seq_cst, align 16
+; TUNIT-NEXT: [[RET:%.*]] = extractvalue { ptr, i1 } [[CMPX]], 0
+; TUNIT-NEXT: ret ptr [[RET]]
+;
+; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
+; CGSCC-LABEL: define {{[^@]+}}@infer_align_cmpxchg_ptr
+; CGSCC-SAME: (ptr nocapture nofree align 16 [[P:%.*]], ptr nofree [[CMP0:%.*]], ptr nofree [[CMP1:%.*]]) #[[ATTR14]] {
+; CGSCC-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CGSCC-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i64, ptr [[ARRAYIDX0]], i64 3
+; CGSCC-NEXT: [[CMPX:%.*]] = cmpxchg ptr [[ARRAYIDX1]], ptr [[CMP0]], ptr [[CMP1]] seq_cst seq_cst, align 16
+; CGSCC-NEXT: [[RET:%.*]] = extractvalue { ptr, i1 } [[CMPX]], 0
+; CGSCC-NEXT: ret ptr [[RET]]
+;
+ %arrayidx0 = getelementptr i64, ptr %p, i64 1
+ %arrayidx1 = getelementptr i64, ptr %arrayidx0, i64 3
+ %cmpx = cmpxchg ptr %arrayidx1, ptr %cmp0, ptr %cmp1 seq_cst seq_cst, align 16
+ %ret = extractvalue { ptr, i1 } %cmpx, 0
+ ret ptr %ret
+}
+
declare void @implicit_cast_callee(i64)
attributes #0 = { nounwind uwtable noinline }
@@ -1119,8 +1230,9 @@ attributes #2 = { null_pointer_is_valid }
; TUNIT: attributes #[[ATTR9]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(write) }
; TUNIT: attributes #[[ATTR10]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
; TUNIT: attributes #[[ATTR11]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(read) }
-; TUNIT: attributes #[[ATTR12]] = { nofree nosync nounwind willreturn memory(read) }
-; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind willreturn }
+; TUNIT: attributes #[[ATTR12]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) }
+; TUNIT: attributes #[[ATTR13]] = { nofree nosync nounwind willreturn memory(read) }
+; TUNIT: attributes #[[ATTR14]] = { nofree nosync nounwind willreturn }
;.
; CGSCC: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable }
; CGSCC: attributes #[[ATTR1]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable }
@@ -1136,6 +1248,7 @@ attributes #2 = { null_pointer_is_valid }
; CGSCC: attributes #[[ATTR11]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) }
; CGSCC: attributes #[[ATTR12]] = { mustprogress nofree nosync nounwind willreturn memory(read) }
; CGSCC: attributes #[[ATTR13]] = { mustprogress nofree nosync nounwind willreturn memory(none) }
-; CGSCC: attributes #[[ATTR14]] = { nofree nosync willreturn }
-; CGSCC: attributes #[[ATTR15]] = { nofree willreturn memory(read) }
+; CGSCC: attributes #[[ATTR14]] = { mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite) }
+; CGSCC: attributes #[[ATTR15]] = { nofree nosync willreturn }
+; CGSCC: attributes #[[ATTR16]] = { nofree willreturn memory(read) }
;.
diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll
index 7d2f0a1351a4..f61388f71c46 100644
--- a/llvm/test/Transforms/Attributor/nocapture-1.ll
+++ b/llvm/test/Transforms/Attributor/nocapture-1.ll
@@ -524,13 +524,13 @@ define void @test6_2(ptr %x6_2, ptr %y6_2, ptr %z6_2) {
define void @test_cmpxchg(ptr %p) {
; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; TUNIT-LABEL: define {{[^@]+}}@test_cmpxchg
-; TUNIT-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[P:%.*]]) #[[ATTR8:[0-9]+]] {
+; TUNIT-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[P:%.*]]) #[[ATTR8:[0-9]+]] {
; TUNIT-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[P]], i32 0, i32 1 acquire monotonic, align 4
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; CGSCC-LABEL: define {{[^@]+}}@test_cmpxchg
-; CGSCC-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[P:%.*]]) #[[ATTR11:[0-9]+]] {
+; CGSCC-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[P:%.*]]) #[[ATTR11:[0-9]+]] {
; CGSCC-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[P]], i32 0, i32 1 acquire monotonic, align 4
; CGSCC-NEXT: ret void
;
@@ -541,13 +541,13 @@ define void @test_cmpxchg(ptr %p) {
define void @test_cmpxchg_ptr(ptr %p, ptr %q) {
; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; TUNIT-LABEL: define {{[^@]+}}@test_cmpxchg_ptr
-; TUNIT-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(8) [[P:%.*]], ptr nofree [[Q:%.*]]) #[[ATTR8]] {
+; TUNIT-SAME: (ptr nocapture nofree noundef nonnull align 8 dereferenceable(8) [[P:%.*]], ptr nofree [[Q:%.*]]) #[[ATTR8]] {
; TUNIT-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[P]], ptr null, ptr [[Q]] acquire monotonic, align 8
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; CGSCC-LABEL: define {{[^@]+}}@test_cmpxchg_ptr
-; CGSCC-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(8) [[P:%.*]], ptr nofree [[Q:%.*]]) #[[ATTR11]] {
+; CGSCC-SAME: (ptr nocapture nofree noundef nonnull align 8 dereferenceable(8) [[P:%.*]], ptr nofree [[Q:%.*]]) #[[ATTR11]] {
; CGSCC-NEXT: [[TMP1:%.*]] = cmpxchg ptr [[P]], ptr null, ptr [[Q]] acquire monotonic, align 8
; CGSCC-NEXT: ret void
;
@@ -558,13 +558,13 @@ define void @test_cmpxchg_ptr(ptr %p, ptr %q) {
define void @test_atomicrmw(ptr %p) {
; TUNIT: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; TUNIT-LABEL: define {{[^@]+}}@test_atomicrmw
-; TUNIT-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[P:%.*]]) #[[ATTR8]] {
+; TUNIT-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[P:%.*]]) #[[ATTR8]] {
; TUNIT-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[P]], i32 1 seq_cst, align 4
; TUNIT-NEXT: ret void
;
; CGSCC: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; CGSCC-LABEL: define {{[^@]+}}@test_atomicrmw
-; CGSCC-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[P:%.*]]) #[[ATTR11]] {
+; CGSCC-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[P:%.*]]) #[[ATTR11]] {
; CGSCC-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[P]], i32 1 seq_cst, align 4
; CGSCC-NEXT: ret void
;
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index 442464cde389..4df647cf3bb5 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -1813,7 +1813,7 @@ define double @fpext(float nofpclass(inf nan) %arg) {
define float @atomicrmw_fadd(ptr %ptr, float nofpclass(inf nan) %val) {
; CHECK: Function Attrs: mustprogress nofree norecurse nounwind willreturn memory(argmem: readwrite)
; CHECK-LABEL: define float @atomicrmw_fadd
-; CHECK-SAME: (ptr nocapture nofree noundef nonnull dereferenceable(4) [[PTR:%.*]], float nofpclass(nan inf) [[VAL:%.*]]) #[[ATTR6:[0-9]+]] {
+; CHECK-SAME: (ptr nocapture nofree noundef nonnull align 4 dereferenceable(4) [[PTR:%.*]], float nofpclass(nan inf) [[VAL:%.*]]) #[[ATTR6:[0-9]+]] {
; CHECK-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VAL]] seq_cst, align 4
; CHECK-NEXT: ret float [[RESULT]]
;
diff --git a/llvm/test/Transforms/Float2Int/pr79158.ll b/llvm/test/Transforms/Float2Int/pr79158.ll
new file mode 100644
index 000000000000..5e78cc0bc66f
--- /dev/null
+++ b/llvm/test/Transforms/Float2Int/pr79158.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=float2int -S | FileCheck %s
+
+define i32 @pr79158(i32 %x) {
+; CHECK-LABEL: define i32 @pr79158(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = zext i1 [[CMP]] to i64
+; CHECK-NEXT: [[MUL1:%.*]] = mul i64 [[TMP0]], 4294967295
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[MUL1]] to i32
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %cmp = icmp sgt i32 %x, 0
+ %conv = uitofp i1 %cmp to double
+ %mul = fmul double %conv, 0x41EFFFFFFFE00000
+ %conv1 = fptoui double %mul to i32
+ ret i32 %conv1
+}
diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll
index 522dcf8db27f..ec3aca26514c 100644
--- a/llvm/test/Transforms/InstCombine/add.ll
+++ b/llvm/test/Transforms/InstCombine/add.ll
@@ -3986,5 +3986,81 @@ define i32 @add_reduce_sqr_sum_varC_invalid2(i32 %a, i32 %b) {
ret i32 %ab2
}
+define i32 @fold_sext_addition_or_disjoint(i8 %x) {
+; CHECK-LABEL: @fold_sext_addition_or_disjoint(
+; CHECK-NEXT: [[SE:%.*]] = sext i8 [[XX:%.*]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[SE]], 1246
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %xx = or disjoint i8 %x, 12
+ %se = sext i8 %xx to i32
+ %r = add i32 %se, 1234
+ ret i32 %r
+}
+
+define i32 @fold_sext_addition_fail(i8 %x) {
+; CHECK-LABEL: @fold_sext_addition_fail(
+; CHECK-NEXT: [[XX:%.*]] = or i8 [[X:%.*]], 12
+; CHECK-NEXT: [[SE:%.*]] = sext i8 [[XX]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[SE]], 1234
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %xx = or i8 %x, 12
+ %se = sext i8 %xx to i32
+ %r = add i32 %se, 1234
+ ret i32 %r
+}
+
+define i32 @fold_zext_addition_or_disjoint(i8 %x) {
+; CHECK-LABEL: @fold_zext_addition_or_disjoint(
+; CHECK-NEXT: [[SE:%.*]] = zext i8 [[XX:%.*]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SE]], 1246
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %xx = or disjoint i8 %x, 12
+ %se = zext i8 %xx to i32
+ %r = add i32 %se, 1234
+ ret i32 %r
+}
+
+define i32 @fold_zext_addition_or_disjoint2(i8 %x) {
+; CHECK-LABEL: @fold_zext_addition_or_disjoint2(
+; CHECK-NEXT: [[XX:%.*]] = add nuw i8 [[X:%.*]], 4
+; CHECK-NEXT: [[SE:%.*]] = zext i8 [[XX]] to i32
+; CHECK-NEXT: ret i32 [[SE]]
+;
+ %xx = or disjoint i8 %x, 18
+ %se = zext i8 %xx to i32
+ %r = add i32 %se, -14
+ ret i32 %r
+}
+
+define i32 @fold_zext_addition_fail(i8 %x) {
+; CHECK-LABEL: @fold_zext_addition_fail(
+; CHECK-NEXT: [[XX:%.*]] = or i8 [[X:%.*]], 12
+; CHECK-NEXT: [[SE:%.*]] = zext i8 [[XX]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nuw nsw i32 [[SE]], 1234
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %xx = or i8 %x, 12
+ %se = zext i8 %xx to i32
+ %r = add i32 %se, 1234
+ ret i32 %r
+}
+
+define i32 @fold_zext_addition_fail2(i8 %x) {
+; CHECK-LABEL: @fold_zext_addition_fail2(
+; CHECK-NEXT: [[XX:%.*]] = or i8 [[X:%.*]], 18
+; CHECK-NEXT: [[SE:%.*]] = zext i8 [[XX]] to i32
+; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[SE]], -14
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %xx = or i8 %x, 18
+ %se = zext i8 %xx to i32
+ %r = add i32 %se, -14
+ ret i32 %r
+}
+
+
declare void @llvm.assume(i1)
declare void @fake_func(i32)
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 82cdb3ce6bee..cd9ec1e59203 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1012,7 +1012,7 @@ define float @missed_nonzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
; CHECK-NEXT: [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
-; CHECK-NEXT: [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], 0.000000e+00
+; CHECK-NEXT: [[MUL3_I_I:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[CONV1_I]])
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
; CHECK-NEXT: ret float [[MUL3_I_I]]
;
@@ -1031,7 +1031,7 @@ define <2 x float> @missed_nonzero_check_on_constant_for_si_fmul_vec(i1 %c, i1 %
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
-; CHECK-NEXT: [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], zeroinitializer
+; CHECK-NEXT: [[MUL3_I_I:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> [[CONV1_I]])
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
;
@@ -1050,7 +1050,8 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[SEL]] to i16
; CHECK-NEXT: [[CONV1_I:%.*]] = sitofp i16 [[CONV_I]] to float
-; CHECK-NEXT: [[MUL3_I_I:%.*]] = fmul float [[CONV1_I]], -0.000000e+00
+; CHECK-NEXT: [[TMP1:%.*]] = fneg float [[CONV1_I]]
+; CHECK-NEXT: [[MUL3_I_I:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
; CHECK-NEXT: ret float [[MUL3_I_I]]
;
@@ -1069,7 +1070,7 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_undef(i1 %c, i1
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
-; CHECK-NEXT: [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float 0.000000e+00>
+; CHECK-NEXT: [[MUL3_I_I:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> [[CONV1_I]])
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
;
@@ -1111,7 +1112,8 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_negz_vec_w_undef(i1 %c
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[CONV1_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
-; CHECK-NEXT: [[MUL3_I_I:%.*]] = fmul <2 x float> [[CONV1_I]], <float undef, float -0.000000e+00>
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <2 x float> [[CONV1_I]]
+; CHECK-NEXT: [[MUL3_I_I:%.*]] = call <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> [[TMP1]])
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
;
diff --git a/llvm/test/Transforms/InstCombine/div.ll b/llvm/test/Transforms/InstCombine/div.ll
index 1309dee817cf..e8a25ff44d02 100644
--- a/llvm/test/Transforms/InstCombine/div.ll
+++ b/llvm/test/Transforms/InstCombine/div.ll
@@ -1810,3 +1810,25 @@ define i6 @udiv_distribute_mul_nsw_add_nuw(i6 %x) {
%div = udiv i6 %add, 3
ret i6 %div
}
+
+define i32 @fold_disjoint_or_over_sdiv(i32 %x) {
+; CHECK-LABEL: @fold_disjoint_or_over_sdiv(
+; CHECK-NEXT: [[R:%.*]] = add nsw i32 [[X:%.*]], 9
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %mul = mul nsw i32 %x, 9
+ %or = or disjoint i32 %mul, 81
+ %r = sdiv i32 %or, 9
+ ret i32 %r
+}
+
+define i32 @fold_disjoint_or_over_udiv(i32 %x) {
+; CHECK-LABEL: @fold_disjoint_or_over_udiv(
+; CHECK-NEXT: [[R:%.*]] = add nuw i32 [[X:%.*]], 9
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %mul = mul nuw i32 %x, 9
+ %or = or disjoint i32 %mul, 81
+ %r = udiv i32 %or, 9
+ ret i32 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll
index 96e57939d285..f6435f003289 100644
--- a/llvm/test/Transforms/InstCombine/fmul.ll
+++ b/llvm/test/Transforms/InstCombine/fmul.ll
@@ -1250,7 +1250,7 @@ define half @mul_zero_nnan(half %x) {
define <2 x float> @mul_zero_nnan_vec_poison(<2 x float> %x) {
; CHECK-LABEL: @mul_zero_nnan_vec_poison(
-; CHECK-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.copysign.v2f32(<2 x float> <float 0.000000e+00, float poison>, <2 x float> [[X:%.*]])
+; CHECK-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.copysign.v2f32(<2 x float> zeroinitializer, <2 x float> [[X:%.*]])
; CHECK-NEXT: ret <2 x float> [[R]]
;
%r = fmul nnan <2 x float> %x, <float 0.0, float poison>
@@ -1268,13 +1268,104 @@ define half @mul_zero(half %x) {
ret half %r
}
-; TODO: This could be fneg+copysign.
-
define half @mul_negzero_nnan(half %x) {
; CHECK-LABEL: @mul_negzero_nnan(
-; CHECK-NEXT: [[R:%.*]] = fmul nnan half [[X:%.*]], 0xH8000
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nnan half [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = call nnan half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
; CHECK-NEXT: ret half [[R]]
;
%r = fmul nnan half %x, -0.0
ret half %r
}
+
+define float @mul_pos_zero_nnan_ninf(float nofpclass(inf nan) %a) {
+; CHECK-LABEL: @mul_pos_zero_nnan_ninf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[A:%.*]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul float %a, 0.000000e+00
+ ret float %ret
+}
+
+define float @mul_pos_zero_nnan(float nofpclass(nan) %a) {
+; CHECK-LABEL: @mul_pos_zero_nnan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RET:%.*]] = fmul float [[A:%.*]], 0.000000e+00
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul float %a, 0.000000e+00
+ ret float %ret
+}
+
+define float @mul_pos_zero_nnan_ninf_fmf(float nofpclass(nan) %a) {
+; CHECK-LABEL: @mul_pos_zero_nnan_ninf_fmf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RET:%.*]] = call ninf float @llvm.copysign.f32(float 0.000000e+00, float [[A:%.*]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul ninf float %a, 0.000000e+00
+ ret float %ret
+}
+
+define float @mul_neg_zero_nnan_ninf(float nofpclass(inf nan) %a) {
+; CHECK-LABEL: @mul_neg_zero_nnan_ninf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fneg float [[A:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP0]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul float %a, -0.000000e+00
+ ret float %ret
+}
+
+define float @mul_neg_zero_nnan_fmf(float %a) {
+; CHECK-LABEL: @mul_neg_zero_nnan_fmf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fneg nnan float [[A:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = call nnan float @llvm.copysign.f32(float 0.000000e+00, float [[TMP0]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul nnan float %a, -0.000000e+00
+ ret float %ret
+}
+
+define float @mul_neg_zero_nnan_ninf_fmf(float nofpclass(inf nan) %a) {
+; CHECK-LABEL: @mul_neg_zero_nnan_ninf_fmf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fneg nnan ninf float [[A:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = call nnan ninf float @llvm.copysign.f32(float 0.000000e+00, float [[TMP0]])
+; CHECK-NEXT: ret float [[RET]]
+;
+entry:
+ %ret = fmul nnan ninf float %a, -0.000000e+00
+ ret float %ret
+}
+
+define <3 x float> @mul_neg_zero_nnan_ninf_vec(<3 x float> nofpclass(inf nan) %a) {
+; CHECK-LABEL: @mul_neg_zero_nnan_ninf_vec(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = fneg <3 x float> [[A:%.*]]
+; CHECK-NEXT: [[RET:%.*]] = call <3 x float> @llvm.copysign.v3f32(<3 x float> zeroinitializer, <3 x float> [[TMP0]])
+; CHECK-NEXT: ret <3 x float> [[RET]]
+;
+entry:
+ %ret = fmul <3 x float> %a, <float -0.0, float undef, float poison>
+ ret <3 x float> %ret
+}
+
+define <3 x float> @mul_mixed_zero_nnan_ninf_vec(<3 x float> nofpclass(inf nan) %a) {
+; CHECK-LABEL: @mul_mixed_zero_nnan_ninf_vec(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RET:%.*]] = fmul <3 x float> [[A:%.*]], <float -0.000000e+00, float 0.000000e+00, float poison>
+; CHECK-NEXT: ret <3 x float> [[RET]]
+;
+entry:
+ %ret = fmul <3 x float> %a, <float -0.0, float 0.0, float poison>
+ ret <3 x float> %ret
+}
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index 32bfdb52bb5f..ac4b88fcddd7 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -424,10 +424,7 @@ define i32 @fptosi_select(i1 %cond) {
define i32 @mul_pos_zero_convert(i32 %a) {
; CHECK-LABEL: @mul_pos_zero_convert(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[FP:%.*]] = sitofp i32 [[A:%.*]] to float
-; CHECK-NEXT: [[RET:%.*]] = fmul float [[FP]], 0.000000e+00
-; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[RET]] to i32
-; CHECK-NEXT: ret i32 [[CONV]]
+; CHECK-NEXT: ret i32 0
;
entry:
%fp = sitofp i32 %a to float
diff --git a/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll b/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
index 4b37ccbe3370..729ca03ddfd1 100644
--- a/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/sadd-with-overflow.ll
@@ -122,3 +122,35 @@ define { i32, i1 } @fold_sub_simple(i32 %x) {
%b = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 30)
ret { i32, i1 } %b
}
+
+define { i32, i1 } @fold_with_distjoin_or(i32 %x) {
+; CHECK-LABEL: @fold_with_distjoin_or(
+; CHECK-NEXT: [[B:%.*]] = add i32 [[X:%.*]], 6
+; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { i32, i1 } { i32 poison, i1 false }, i32 [[B]], 0
+; CHECK-NEXT: ret { i32, i1 } [[TMP1]]
+;
+ %a = or disjoint i32 %x, 13
+ %b = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 -7)
+ ret { i32, i1 } %b
+}
+
+define { i32, i1 } @fold_with_disjoint_or2(i32 %x) {
+; CHECK-LABEL: @fold_with_disjoint_or2(
+; CHECK-NEXT: [[B:%.*]] = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[X:%.*]], i32 127)
+; CHECK-NEXT: ret { i32, i1 } [[B]]
+;
+ %a = or disjoint i32 %x, 100
+ %b = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 27)
+ ret { i32, i1 } %b
+}
+
+define { i32, i1 } @fold_with_or_fail(i32 %x) {
+; CHECK-LABEL: @fold_with_or_fail(
+; CHECK-NEXT: [[A:%.*]] = or i32 [[X:%.*]], 100
+; CHECK-NEXT: [[B:%.*]] = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 [[A]], i32 27)
+; CHECK-NEXT: ret { i32, i1 } [[B]]
+;
+ %a = or i32 %x, 100
+ %b = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 27)
+ ret { i32, i1 } %b
+}
diff --git a/llvm/test/Transforms/InstCombine/shift-add.ll b/llvm/test/Transforms/InstCombine/shift-add.ll
index 1b2567505993..aa3a238e0949 100644
--- a/llvm/test/Transforms/InstCombine/shift-add.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add.ll
@@ -775,3 +775,32 @@ define <3 x i32> @add3_i96(<3 x i32> %0, <3 x i32> %1) {
%25 = insertelement <3 x i32> %24, i32 %20, i32 2
ret <3 x i32> %25
}
+
+define i8 @shl_fold_or_disjoint_cnt(i8 %x) {
+; CHECK-LABEL: @shl_fold_or_disjoint_cnt(
+; CHECK-NEXT: [[R:%.*]] = shl i8 16, [[X:%.*]]
+; CHECK-NEXT: ret i8 [[R]]
+;
+ %a = or disjoint i8 %x, 3
+ %r = shl i8 2, %a
+ ret i8 %r
+}
+
+define <2 x i8> @ashr_fold_or_disjoint_cnt(<2 x i8> %x) {
+; CHECK-LABEL: @ashr_fold_or_disjoint_cnt(
+; CHECK-NEXT: [[R:%.*]] = lshr <2 x i8> <i8 0, i8 1>, [[X:%.*]]
+; CHECK-NEXT: ret <2 x i8> [[R]]
+;
+ %a = or disjoint <2 x i8> %x, <i8 3, i8 1>
+ %r = ashr <2 x i8> <i8 2, i8 3>, %a
+ ret <2 x i8> %r
+}
+
+define <2 x i8> @lshr_fold_or_disjoint_cnt_out_of_bounds(<2 x i8> %x) {
+; CHECK-LABEL: @lshr_fold_or_disjoint_cnt_out_of_bounds(
+; CHECK-NEXT: ret <2 x i8> zeroinitializer
+;
+ %a = or disjoint <2 x i8> %x, <i8 3, i8 8>
+ %r = lshr <2 x i8> <i8 2, i8 3>, %a
+ ret <2 x i8> %r
+}
diff --git a/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
index 44ec77e471bb..f573ff36d2ce 100644
--- a/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle_select-inseltpoison.ll
@@ -336,7 +336,18 @@ define <4 x i32> @srem(<4 x i32> %v) {
; Try FP ops/types.
-define <4 x float> @fadd(<4 x float> %v) {
+define <4 x float> @fadd_maybe_nan(<4 x float> %v) {
+; CHECK-LABEL: @fadd_maybe_nan(
+; CHECK-NEXT: [[B:%.*]] = fadd <4 x float> [[V:%.*]], <float 4.100000e+01, float 4.200000e+01, float poison, float poison>
+; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %b = fadd <4 x float> %v, <float 41.0, float 42.0, float 43.0, float 44.0>
+ %s = shufflevector <4 x float> %b, <4 x float> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %s
+}
+
+define <4 x float> @fadd(<4 x float> nofpclass(nan) %v) {
; CHECK-LABEL: @fadd(
; CHECK-NEXT: [[S:%.*]] = fadd <4 x float> [[V:%.*]], <float 4.100000e+01, float 4.200000e+01, float -0.000000e+00, float -0.000000e+00>
; CHECK-NEXT: ret <4 x float> [[S]]
@@ -359,7 +370,7 @@ define <4 x double> @fsub(<4 x double> %v) {
; Propagate any FMF.
-define <4 x float> @fmul(<4 x float> %v) {
+define <4 x float> @fmul(<4 x float> nofpclass(nan) %v) {
; CHECK-LABEL: @fmul(
; CHECK-NEXT: [[S:%.*]] = fmul nnan ninf <4 x float> [[V:%.*]], <float 4.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: ret <4 x float> [[S]]
@@ -380,7 +391,7 @@ define <4 x double> @fdiv_constant_op0(<4 x double> %v) {
ret <4 x double> %s
}
-define <4 x double> @fdiv_constant_op1(<4 x double> %v) {
+define <4 x double> @fdiv_constant_op1(<4 x double> nofpclass(nan) %v) {
; CHECK-LABEL: @fdiv_constant_op1(
; CHECK-NEXT: [[S:%.*]] = fdiv reassoc <4 x double> [[V:%.*]], <double undef, double 1.000000e+00, double 4.300000e+01, double 4.400000e+01>
; CHECK-NEXT: ret <4 x double> [[S]]
diff --git a/llvm/test/Transforms/InstCombine/shuffle_select.ll b/llvm/test/Transforms/InstCombine/shuffle_select.ll
index a1b0d782b554..efadb5c3c109 100644
--- a/llvm/test/Transforms/InstCombine/shuffle_select.ll
+++ b/llvm/test/Transforms/InstCombine/shuffle_select.ll
@@ -336,7 +336,18 @@ define <4 x i32> @srem(<4 x i32> %v) {
; Try FP ops/types.
-define <4 x float> @fadd(<4 x float> %v) {
+define <4 x float> @fadd_maybe_nan(<4 x float> %v) {
+; CHECK-LABEL: @fadd_maybe_nan(
+; CHECK-NEXT: [[B:%.*]] = fadd <4 x float> [[V:%.*]], <float 4.100000e+01, float 4.200000e+01, float poison, float poison>
+; CHECK-NEXT: [[S:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[V]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+; CHECK-NEXT: ret <4 x float> [[S]]
+;
+ %b = fadd <4 x float> %v, <float 41.0, float 42.0, float 43.0, float 44.0>
+ %s = shufflevector <4 x float> %b, <4 x float> %v, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
+ ret <4 x float> %s
+}
+
+define <4 x float> @fadd(<4 x float> nofpclass(nan) %v) {
; CHECK-LABEL: @fadd(
; CHECK-NEXT: [[S:%.*]] = fadd <4 x float> [[V:%.*]], <float 4.100000e+01, float 4.200000e+01, float -0.000000e+00, float -0.000000e+00>
; CHECK-NEXT: ret <4 x float> [[S]]
@@ -359,7 +370,7 @@ define <4 x double> @fsub(<4 x double> %v) {
; Propagate any FMF.
-define <4 x float> @fmul(<4 x float> %v) {
+define <4 x float> @fmul(<4 x float> nofpclass(nan) %v) {
; CHECK-LABEL: @fmul(
; CHECK-NEXT: [[S:%.*]] = fmul nnan ninf <4 x float> [[V:%.*]], <float 4.100000e+01, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
; CHECK-NEXT: ret <4 x float> [[S]]
@@ -380,7 +391,7 @@ define <4 x double> @fdiv_constant_op0(<4 x double> %v) {
ret <4 x double> %s
}
-define <4 x double> @fdiv_constant_op1(<4 x double> %v) {
+define <4 x double> @fdiv_constant_op1(<4 x double> nofpclass(nan) %v) {
; CHECK-LABEL: @fdiv_constant_op1(
; CHECK-NEXT: [[S:%.*]] = fdiv reassoc <4 x double> [[V:%.*]], <double undef, double 1.000000e+00, double 4.300000e+01, double 4.400000e+01>
; CHECK-NEXT: ret <4 x double> [[S]]
diff --git a/llvm/test/Transforms/InstCombine/uadd-with-overflow.ll b/llvm/test/Transforms/InstCombine/uadd-with-overflow.ll
index 28d309baaa41..fd5d38bb38dd 100644
--- a/llvm/test/Transforms/InstCombine/uadd-with-overflow.ll
+++ b/llvm/test/Transforms/InstCombine/uadd-with-overflow.ll
@@ -124,3 +124,26 @@ define { i32, i1 } @no_fold_wrapped_add(i32 %x) {
%b = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 30, i32 %a)
ret { i32, i1 } %b
}
+
+
+define { <2 x i32>, <2 x i1> } @fold_simple_splat_with_disjoint_or_constant(<2 x i32> %x) {
+; CHECK-LABEL: @fold_simple_splat_with_disjoint_or_constant(
+; CHECK-NEXT: [[B:%.*]] = call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> [[X:%.*]], <2 x i32> <i32 42, i32 42>)
+; CHECK-NEXT: ret { <2 x i32>, <2 x i1> } [[B]]
+;
+ %a = or disjoint <2 x i32> %x, <i32 12, i32 12>
+ %b = tail call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> <i32 30, i32 30>)
+ ret { <2 x i32>, <2 x i1> } %b
+}
+
+
+define { <2 x i32>, <2 x i1> } @fold_simple_splat_constant_with_or_fail(<2 x i32> %x) {
+; CHECK-LABEL: @fold_simple_splat_constant_with_or_fail(
+; CHECK-NEXT: [[A:%.*]] = or <2 x i32> [[X:%.*]], <i32 12, i32 12>
+; CHECK-NEXT: [[B:%.*]] = tail call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> [[A]], <2 x i32> <i32 30, i32 30>)
+; CHECK-NEXT: ret { <2 x i32>, <2 x i1> } [[B]]
+;
+ %a = or <2 x i32> %x, <i32 12, i32 12>
+ %b = tail call { <2 x i32>, <2 x i1> } @llvm.uadd.with.overflow.v2i32(<2 x i32> %a, <2 x i32> <i32 30, i32 30>)
+ ret { <2 x i32>, <2 x i1> } %b
+}
diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
index e6a0c5f45375..daa64f2e2ea7 100644
--- a/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
+++ b/llvm/test/Transforms/LoopIdiom/AArch64/byte-compare-index.ll
@@ -2,6 +2,9 @@
; RUN: opt -aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
; RUN: opt -aarch64-lit -simplifycfg -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
; RUN: opt -aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
+; RUN: opt -p aarch64-lit -aarch64-lit-verify -verify-dom-info -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s
+; RUN: opt -passes='function(loop(aarch64-lit)),simplifycfg' -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=LOOP-DEL
+; RUN: opt -p aarch64-lit -mtriple aarch64-unknown-linux-gnu -S < %s | FileCheck %s --check-prefix=NO-TRANSFORM
define i32 @compare_bytes_simple(ptr %a, ptr %b, i32 %len, i32 %extra, i32 %n) {
; CHECK-LABEL: define i32 @compare_bytes_simple(
@@ -780,7 +783,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[MISMATCH_END]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY:%.*]] ]
; CHECK-NEXT: [[INC:%.*]] = add i32 [[LEN_ADDR]], 1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[MISMATCH_RESULT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; CHECK: while.body:
; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[MISMATCH_RESULT]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
@@ -788,11 +791,14 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; CHECK-NEXT: [[TMP46:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; CHECK-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP45]], [[TMP46]]
-; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; CHECK-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
; CHECK: byte.compare:
+; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT]]
+; CHECK: while.end.loopexit:
+; CHECK-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
; CHECK-NEXT: br label [[WHILE_END]]
; CHECK: while.end:
-; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[MISMATCH_RESULT]], [[WHILE_BODY]] ], [ [[MISMATCH_RESULT]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ], [ [[MISMATCH_RESULT]], [[BYTE_COMPARE]] ]
+; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
; CHECK-NEXT: ret i32 [[INC_LCSSA]]
;
; LOOP-DEL-LABEL: define i32 @compare_bytes_extra_cmp(
@@ -884,7 +890,7 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; NO-TRANSFORM-NEXT: [[LEN_ADDR:%.*]] = phi i32 [ [[LEN]], [[PH]] ], [ [[INC:%.*]], [[WHILE_BODY:%.*]] ]
; NO-TRANSFORM-NEXT: [[INC]] = add i32 [[LEN_ADDR]], 1
; NO-TRANSFORM-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
; NO-TRANSFORM: while.body:
; NO-TRANSFORM-NEXT: [[IDXPROM:%.*]] = zext i32 [[INC]] to i64
; NO-TRANSFORM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IDXPROM]]
@@ -892,9 +898,12 @@ define i32 @compare_bytes_extra_cmp(ptr %a, ptr %b, i32 %len, i32 %n, i32 %x) {
; NO-TRANSFORM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[IDXPROM]]
; NO-TRANSFORM-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
; NO-TRANSFORM-NEXT: [[CMP_NOT2:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
-; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END]]
+; NO-TRANSFORM-NEXT: br i1 [[CMP_NOT2]], label [[WHILE_COND]], label [[WHILE_END_LOOPEXIT]]
+; NO-TRANSFORM: while.end.loopexit:
+; NO-TRANSFORM-NEXT: [[INC_LCSSA1:%.*]] = phi i32 [ [[INC]], [[WHILE_COND]] ], [ [[INC]], [[WHILE_BODY]] ]
+; NO-TRANSFORM-NEXT: br label [[WHILE_END]]
; NO-TRANSFORM: while.end:
-; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[WHILE_BODY]] ], [ [[INC]], [[WHILE_COND]] ], [ [[X]], [[ENTRY:%.*]] ]
+; NO-TRANSFORM-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ [[INC_LCSSA1]], [[WHILE_END_LOOPEXIT]] ]
; NO-TRANSFORM-NEXT: ret i32 [[INC_LCSSA]]
;
entry:
@@ -908,7 +917,7 @@ while.cond:
%len.addr = phi i32 [ %len, %ph ], [ %inc, %while.body ]
%inc = add i32 %len.addr, 1
%cmp.not = icmp eq i32 %inc, %n
- br i1 %cmp.not, label %while.end, label %while.body
+ br i1 %cmp.not, label %while.end.loopexit, label %while.body
while.body:
%idxprom = zext i32 %inc to i64
@@ -917,10 +926,14 @@ while.body:
%arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %idxprom
%1 = load i8, ptr %arrayidx2
%cmp.not2 = icmp eq i8 %0, %1
- br i1 %cmp.not2, label %while.cond, label %while.end
+ br i1 %cmp.not2, label %while.cond, label %while.end.loopexit
+
+while.end.loopexit:
+ %inc.lcssa1 = phi i32 [ %inc, %while.cond ], [ %inc, %while.body ]
+ br label %while.end
while.end:
- %inc.lcssa = phi i32 [ %inc, %while.body ], [ %inc, %while.cond ], [ %x, %entry ]
+ %inc.lcssa = phi i32 [ %x, %entry ], [ %inc.lcssa1, %while.end.loopexit ]
ret i32 %inc.lcssa
}
diff --git a/llvm/test/Transforms/PGOProfile/memop_profile_funclet_wasm.ll b/llvm/test/Transforms/PGOProfile/memop_profile_funclet_wasm.ll
new file mode 100644
index 000000000000..f8dcb768c94c
--- /dev/null
+++ b/llvm/test/Transforms/PGOProfile/memop_profile_funclet_wasm.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefixes=CHECK,GEN
+; RUN: opt < %s -passes=pgo-instr-gen,instrprof -S | FileCheck %s --check-prefixes=CHECK,LOWER
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+define void @wasm_funclet_op_bundle(ptr %p, ptr %dst, ptr %src) personality ptr @__gxx_wasm_personality_v0 {
+entry:
+ invoke void @foo()
+ to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch: ; preds = %entry
+ %0 = catchswitch within none [label %catch.start] unwind to caller
+
+catch.start: ; preds = %catch.dispatch
+ %1 = catchpad within %0 [ptr null]
+; CHECK: %[[CATCHPAD:.*]] = catchpad
+ %2 = call ptr @llvm.wasm.get.exception(token %1)
+ %3 = call i32 @llvm.wasm.get.ehselector(token %1)
+ %4 = call ptr @__cxa_begin_catch(ptr %2) #3 [ "funclet"(token %1) ]
+ %tmp = load i32, ptr %p, align 4
+ call void @llvm.memcpy.p0.p0.i32(ptr %dst, ptr %src, i32 %tmp, i1 false)
+; GEN: call void @llvm.instrprof.value.profile({{.*}}) [ "funclet"(token %[[CATCHPAD]]) ]
+; LOWER: call void @__llvm_profile_instrument_memop({{.*}}) [ "funclet"(token %[[CATCHPAD]]) ]
+ call void @__cxa_end_catch() [ "funclet"(token %1) ]
+ catchret from %1 to label %try.cont
+
+try.cont: ; preds = %catch.start, %entry
+ ret void
+}
+
+declare void @foo()
+declare i32 @__gxx_wasm_personality_v0(...)
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare ptr @llvm.wasm.get.exception(token) #0
+; Function Attrs: nocallback nofree nosync nounwind willreturn
+declare i32 @llvm.wasm.get.ehselector(token) #0
+; Function Attrs: nounwind memory(none)
+declare i32 @llvm.eh.typeid.for(ptr) #1
+declare ptr @__cxa_begin_catch(ptr)
+declare void @__cxa_end_catch()
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) #2
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn }
+attributes #1 = { nounwind memory(none) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #3 = { nounwind }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
index e61b254b7a5f..495ec0a63399 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr67803.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK
-; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s
define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: @PR67803(
@@ -11,16 +11,14 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
; CHECK-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[SEXT_I22]] to <2 x i64>
; CHECK-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[SEXT_I]] to <2 x i64>
-; CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <32 x i8> [[TMP5]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP7]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[SHUFFLE_I]] to <32 x i8>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP9]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP11:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP6]], <16 x i8> [[TMP8]], <16 x i8> [[TMP10]])
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
@@ -28,42 +26,13 @@ define <4 x i64> @PR67803(<4 x i64> %x, <4 x i64> %y, <4 x i64> %a, <4 x i64> %b
; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <32 x i8> [[TMP13]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <32 x i8> [[TMP15]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i64> [[SHUFFLE_I]] to <32 x i8>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <32 x i8> [[TMP17]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP19:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP14]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]])
; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i8> [[TMP19]] to <2 x i64>
; CHECK-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
;
-; AVX512-LABEL: @PR67803(
-; AVX512-NEXT: entry:
-; AVX512-NEXT: [[TMP0:%.*]] = bitcast <4 x i64> [[X:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP1:%.*]] = bitcast <4 x i64> [[Y:%.*]] to <8 x i32>
-; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <8 x i32> [[TMP0]], [[TMP1]]
-; AVX512-NEXT: [[CMP_I21:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: [[SEXT_I22:%.*]] = sext <4 x i1> [[CMP_I21]] to <4 x i32>
-; AVX512-NEXT: [[CMP_I:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
-; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[SEXT_I22]], <4 x i32> [[SEXT_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; AVX512-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[A:%.*]] to <32 x i8>
-; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <32 x i8> [[TMP4]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP6:%.*]] = bitcast <4 x i64> [[B:%.*]] to <32 x i8>
-; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP6]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; AVX512-NEXT: [[TMP9:%.*]] = shufflevector <32 x i8> [[TMP8]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; AVX512-NEXT: [[TMP10:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i8> [[TMP9]])
-; AVX512-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-; AVX512-NEXT: [[TMP12:%.*]] = bitcast <4 x i64> [[A]] to <32 x i8>
-; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <32 x i8> [[TMP12]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: [[TMP14:%.*]] = bitcast <4 x i64> [[B]] to <32 x i8>
-; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <32 x i8> [[TMP14]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: [[TMP16:%.*]] = bitcast <8 x i32> [[TMP3]] to <32 x i8>
-; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <32 x i8> [[TMP16]], <32 x i8> poison, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AVX512-NEXT: [[TMP18:%.*]] = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> [[TMP13]], <16 x i8> [[TMP15]], <16 x i8> [[TMP17]])
-; AVX512-NEXT: [[TMP19:%.*]] = bitcast <16 x i8> [[TMP18]] to <2 x i64>
-; AVX512-NEXT: [[SHUFFLE_I23:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; AVX512-NEXT: ret <4 x i64> [[SHUFFLE_I23]]
-;
entry:
%0 = bitcast <4 x i64> %x to <8 x i32>
%extract = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-nodes-different-bb.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-nodes-different-bb.ll
new file mode 100644
index 000000000000..2acbe89b9775
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-nodes-different-bb.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux -mattr="-avx512pf,+avx512f,+avx512bw" -slp-threshold=-100 < %s | FileCheck %s
+
+define i1 @foo(i32 %a) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i32 0, [[A:%.*]]
+; CHECK-NEXT: br label [[BB4:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[LOCAL:%.*]] = sub nsw i32 0, 0
+; CHECK-NEXT: [[INS1:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[ADD:%.*]] = icmp eq i32 [[TMP0]], [[LOCAL]]
+; CHECK-NEXT: ret i1 [[ADD]]
+;
+entry:
+ %0 = sub nsw i32 0, %a
+ br label %bb1
+
+bb1:
+ %local = sub nsw i32 0, 0
+ %ins1 = insertelement <2 x i32> poison, i32 %0, i32 0
+ %add = icmp eq i32 %0, %local
+ ret i1 %add
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-abs-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-abs-minbitwidth.ll
new file mode 100644
index 000000000000..e8b854b7cea6
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/store-abs-minbitwidth.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx512vl -passes=slp-vectorizer -slp-threshold=-3 | FileCheck %s
+
+
+define i32 @test(ptr noalias %in, ptr noalias %inn, ptr %out) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i8>, ptr [[INN:%.*]], align 1
+; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i8, ptr [[INN]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i8>, ptr [[GEP_5]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[TMP7]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = sext <4 x i8> [[TMP11]] to <4 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = sub <4 x i32> [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP13]], i1 true)
+; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16>
+; CHECK-NEXT: store <4 x i16> [[TMP15]], ptr [[OUT:%.*]], align 2
+; CHECK-NEXT: ret i32 undef
+;
+ %load.1 = load i8, ptr %in, align 1
+ %gep.1 = getelementptr inbounds i8, ptr %in, i64 1
+ %load.2 = load i8, ptr %gep.1, align 1
+ %gep.2 = getelementptr inbounds i8, ptr %in, i64 2
+ %load.3 = load i8, ptr %gep.2, align 1
+ %gep.3 = getelementptr inbounds i8, ptr %in, i64 3
+ %load.4 = load i8, ptr %gep.3, align 1
+ %load.5 = load i8, ptr %inn, align 1
+ %gep.4 = getelementptr inbounds i8, ptr %inn, i64 1
+ %load.6 = load i8, ptr %gep.4, align 1
+ %gep.5 = getelementptr inbounds i8, ptr %inn, i64 2
+ %load.7 = load i8, ptr %gep.5, align 1
+ %gep.6 = getelementptr inbounds i8, ptr %inn, i64 3
+ %load.8 = load i8, ptr %gep.6, align 1
+ %sext1 = sext i8 %load.1 to i32
+ %sext2 = sext i8 %load.2 to i32
+ %sext3 = sext i8 %load.3 to i32
+ %sext4 = sext i8 %load.4 to i32
+ %sext5 = sext i8 %load.5 to i32
+ %sext6 = sext i8 %load.6 to i32
+ %sext7 = sext i8 %load.7 to i32
+ %sext8 = sext i8 %load.8 to i32
+ %sub1 = sub i32 %sext1, %sext5
+ %sub2 = sub i32 %sext2, %sext6
+ %sub3 = sub i32 %sext7, %sext3
+ %sub4 = sub i32 %sext8, %sext4
+ %call1 = call i32 @llvm.abs(i32 %sub1, i1 true)
+ %call2 = call i32 @llvm.abs(i32 %sub2, i1 true)
+ %call3 = call i32 @llvm.abs(i32 %sub3, i1 true)
+ %call4 = call i32 @llvm.abs(i32 %sub4, i1 true)
+ %t1 = trunc i32 %call1 to i16
+ %t2 = trunc i32 %call2 to i16
+ %t3 = trunc i32 %call3 to i16
+ %t4 = trunc i32 %call4 to i16
+ %gep.8 = getelementptr inbounds i16, ptr %out, i64 1
+ %gep.9 = getelementptr inbounds i16, ptr %out, i64 2
+ %gep.10 = getelementptr inbounds i16, ptr %out, i64 3
+ store i16 %t1, ptr %out, align 2
+ store i16 %t2, ptr %gep.8, align 2
+ store i16 %t3, ptr %gep.9, align 2
+ store i16 %t4, ptr %gep.10, align 2
+
+ ret i32 undef
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll
new file mode 100644
index 000000000000..531e96405348
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/orig-btiwidth-les-projected.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer < %s | FileCheck %s
+
+define i32 @test(i4 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i4 [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i8 0 to i4
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i8 0 to i4
+; CHECK-NEXT: [[ADD_R:%.*]] = or i4 [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[ADD_R14:%.*]] = or i4 0, [[TMP2]]
+; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i4 [[ADD_R]], [[ADD_R14]]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %1 = trunc i8 0 to i4
+ %2 = trunc i8 0 to i4
+ %add.r = or i4 %1, %0
+ %add.r14 = or i4 0, %2
+ %cmp.not = icmp eq i4 %add.r, %add.r14
+ ret i32 0
+}
diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test
index 00eddd2a4752..9ce53fe0fea0 100644
--- a/llvm/test/tools/llvm-lib/arm64ec-implib.test
+++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test
@@ -34,17 +34,17 @@ ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll
RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s
-READOBJ: File: test.lib(test.dll)
+READOBJ: File: test{{.*}}.lib(test.dll)
READOBJ-NEXT: Format: COFF-ARM64{{$}}
READOBJ-NEXT: Arch: aarch64
READOBJ-NEXT: AddressSize: 64bit
READOBJ-EMPTY:
-READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: File: test{{.*}}.lib(test.dll)
READOBJ-NEXT: Format: COFF-ARM64{{$}}
READOBJ-NEXT: Arch: aarch64
READOBJ-NEXT: AddressSize: 64bit
READOBJ-EMPTY:
-READOBJ-NEXT: File: test.lib(test.dll)
+READOBJ-NEXT: File: test{{.*}}.lib(test.dll)
READOBJ-NEXT: Format: COFF-ARM64{{$}}
READOBJ-NEXT: Arch: aarch64
READOBJ-NEXT: AddressSize: 64bit
@@ -96,6 +96,11 @@ READOBJ-NEXT: Name type: name
READOBJ-NEXT: Export name: dataexp
READOBJ-NEXT: Symbol: __imp_dataexp
+Using -machine:arm64x gives the same output.
+RUN: llvm-lib -machine:arm64x -def:test.def -out:testx.lib
+RUN: llvm-nm --print-armap testx.lib | FileCheck -check-prefix=ARMAP %s
+RUN: llvm-readobj testx.lib | FileCheck -check-prefix=READOBJ %s
+
Creating a new lib containing the existing lib:
RUN: llvm-lib -machine:arm64ec test.lib -out:test2.lib
RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAP %s
@@ -246,7 +251,9 @@ READOBJX-NEXT: Symbol: __imp_dataexp
RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test2.def -out:test2.lib
+RUN: llvm-lib -machine:arm64ec -def:test.def -defArm64Native:test2.def -out:test2x.lib
RUN: llvm-nm --print-armap test2.lib | FileCheck -check-prefix=ARMAPX2 %s
+RUN: llvm-nm --print-armap test2x.lib | FileCheck -check-prefix=ARMAPX2 %s
ARMAPX2: Archive map
ARMAPX2-NEXT: __IMPORT_DESCRIPTOR_test2 in test2.dll
diff --git a/llvm/test/tools/llvm-objcopy/ELF/skip-symbol.test b/llvm/test/tools/llvm-objcopy/ELF/skip-symbol.test
new file mode 100644
index 000000000000..0f3ab808482b
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/ELF/skip-symbol.test
@@ -0,0 +1,100 @@
+## This test checks the functionality of options --skip-symbol and --skip-symbols.
+# RUN: yaml2obj %s -o %t.o
+# RUN: echo 'foo[2-3]' > %t.skip.regex
+
+## Check --skip-symbol functionality when changing symbol bindings.
+# RUN: llvm-objcopy %t.o %t2.o --localize-hidden --skip-symbol=foo3
+# RUN: llvm-readelf -s %t2.o | FileCheck %s --check-prefix=LH-SYM
+# LH-SYM-DAG: LOCAL HIDDEN 1 foo1
+# LH-SYM-DAG: LOCAL HIDDEN 1 foo2
+# LH-SYM-DAG: GLOBAL HIDDEN 1 foo3
+# LH-SYM-DAG: LOCAL HIDDEN 1 foo4
+# LH-SYM-DAG: LOCAL HIDDEN 1 foo5
+
+## Check --skip-symbols functionality when changing symbol bindings.
+# RUN: llvm-objcopy %t.o %t1.o --localize-hidden --skip-symbols=%t.skip.regex --regex
+# RUN: llvm-readelf -s %t1.o | FileCheck %s --check-prefix=LH-SYMS
+# LH-SYMS-DAG: LOCAL HIDDEN 1 foo1
+# LH-SYMS-DAG: GLOBAL HIDDEN 1 foo2
+# LH-SYMS-DAG: GLOBAL HIDDEN 1 foo3
+# LH-SYMS-DAG: LOCAL HIDDEN 1 foo4
+# LH-SYMS-DAG: LOCAL HIDDEN 1 foo5
+
+## Check --skip-symbol functionality when changing symbol names.
+# RUN: echo -e "foo1 bar1\nfoo2 bar2" > %t.renames.list
+# RUN: llvm-objcopy %t.o %t4.o --redefine-syms=%t.renames.list \
+# RUN: --skip-symbol='fo*' --wildcard
+# RUN: llvm-readelf -s %t4.o | FileCheck %s --check-prefix=RS-SYM
+# RS-SYM-DAG: foo1
+# RS-SYM-DAG: foo2
+# RS-SYM-DAG: foo3
+# RS-SYM-DAG: foo4
+# RS-SYM-DAG: foo5
+
+## Check --skip-symbols functionality when changing symbol names.
+# RUN: llvm-objcopy %t.o %t3.o --redefine-syms=%t.renames.list \
+# RUN: --skip-symbols=%t.skip.regex --regex
+# RUN: llvm-readelf -s %t3.o | FileCheck %s --check-prefix=RS-SYMS
+# RS-SYMS-DAG: bar1
+# RS-SYMS-DAG: foo2
+# RS-SYMS-DAG: foo3
+# RS-SYMS-DAG: foo4
+# RS-SYMS-DAG: foo5
+
+## Check the functionality when using skip options multiple times.
+# RUN: echo "foo3" > %t.symbol0.list
+# RUN: echo "foo4" > %t.symbol1.list
+# RUN: llvm-objcopy %t.o %t5.o --set-symbol-visibility='foo*'=internal --wildcard \
+# RUN: --skip-symbol=foo1 --skip-symbol=foo2 \
+# RUN: --skip-symbols=%t.symbol0.list --skip-symbols=%t.symbol1.list
+# RUN: llvm-readelf -s %t5.o | FileCheck %s --check-prefix=BOTH
+# BOTH-DAG: GLOBAL HIDDEN 1 foo1
+# BOTH-DAG: GLOBAL HIDDEN 1 foo2
+# BOTH-DAG: GLOBAL HIDDEN 1 foo3
+# BOTH-DAG: GLOBAL HIDDEN 1 foo4
+## Only foo5 is not skipped.
+# BOTH-DAG: GLOBAL INTERNAL 1 foo5
+
+## Check that using an invalid symbol name regex generates an error.
+# RUN: echo '*.' > %t.symbols.regex
+# RUN: not llvm-objcopy %t.o --skip-symbols=%t.symbols.regex --regex 2>&1 | \
+# RUN: FileCheck %s --check-prefix=SYMBOL
+# RUN: not llvm-objcopy %t.o --skip-symbol='*.' --regex 2>&1 | \
+# RUN: FileCheck %s --check-prefix=SYMBOL
+# SYMBOL: error: cannot compile regular expression '*.': repetition-operator operand invalid
+
+## Check passing an invalid filename generates an error.
+# RUN: not llvm-objcopy %t.o --skip-symbols=no_file 2>&1 | \
+# RUN: FileCheck %s --check-prefix=FILE -DMSG=%errc_ENOENT
+# FILE: error: 'no_file': [[MSG]]
+
+!ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_REL
+ Machine: EM_X86_64
+Sections:
+ - Name: .text
+ Type: SHT_PROGBITS
+Symbols:
+ - Name: foo1
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
+ - Name: foo2
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
+ - Name: foo3
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
+ - Name: foo4
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
+ - Name: foo5
+ Section: .text
+ Binding: STB_GLOBAL
+ Other: [ STV_HIDDEN ]
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
index a0c6415bf0e6..7269c51a08d6 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
+++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp
@@ -978,6 +978,15 @@ objcopy::parseObjcopyOptions(ArrayRef<const char *> RawArgsArr,
addSymbolsFromFile(Config.SymbolsToKeep, DC.Alloc, Arg->getValue(),
SymbolMatchStyle, ErrorCallback))
return std::move(E);
+ for (auto *Arg : InputArgs.filtered(OBJCOPY_skip_symbol))
+ if (Error E = Config.SymbolsToSkip.addMatcher(NameOrPattern::create(
+ Arg->getValue(), SymbolMatchStyle, ErrorCallback)))
+ return std::move(E);
+ for (auto *Arg : InputArgs.filtered(OBJCOPY_skip_symbols))
+ if (Error E =
+ addSymbolsFromFile(Config.SymbolsToSkip, DC.Alloc, Arg->getValue(),
+ SymbolMatchStyle, ErrorCallback))
+ return std::move(E);
for (auto *Arg : InputArgs.filtered(OBJCOPY_add_symbol)) {
Expected<NewSymbolInfo> SymInfo = parseNewSymbolInfo(Arg->getValue());
if (!SymInfo)
diff --git a/llvm/tools/llvm-objcopy/ObjcopyOpts.td b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
index 3c0e5cd475a3..be02616e8c68 100644
--- a/llvm/tools/llvm-objcopy/ObjcopyOpts.td
+++ b/llvm/tools/llvm-objcopy/ObjcopyOpts.td
@@ -206,6 +206,20 @@ defm keep_symbols
"be repeated to read symbols from many files">,
MetaVarName<"filename">;
+defm skip_symbol : Eq<"skip-symbol", "Do not change parameters of symbol <symbol> "
+ "when executing other options that can change the symbol's "
+ "name, binding or visibility">,
+ MetaVarName<"symbol">;
+
+defm skip_symbols
+ : Eq<"skip-symbols",
+ "Read a list of symbols from <filename> and run as if "
+ "--skip-symbol=<symbol> is set for each one. <filename> "
+ "contains one symbol per line and may contain comments beginning with "
+ "'#'. Leading and trailing whitespace is stripped from each line. May "
+ "be repeated to read symbols from many files">,
+ MetaVarName<"filename">;
+
defm dump_section
: Eq<"dump-section",
"Dump contents of section named <section> into file <file>">,
diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp
index 34a162a5514e..8ec120d70e99 100644
--- a/llvm/unittests/IR/ConstantRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantRangeTest.cpp
@@ -2479,6 +2479,24 @@ TEST_F(ConstantRangeTest, castOps) {
ConstantRange IntToPtr = A.castOp(Instruction::IntToPtr, 64);
EXPECT_EQ(64u, IntToPtr.getBitWidth());
EXPECT_TRUE(IntToPtr.isFullSet());
+
+ ConstantRange UIToFP = A.castOp(Instruction::UIToFP, 16);
+ EXPECT_EQ(16u, UIToFP.getBitWidth());
+ EXPECT_TRUE(UIToFP.isFullSet());
+
+ ConstantRange UIToFP2 = A.castOp(Instruction::UIToFP, 64);
+ ConstantRange B(APInt(64, 0), APInt(64, 65536));
+ EXPECT_EQ(64u, UIToFP2.getBitWidth());
+ EXPECT_EQ(B, UIToFP2);
+
+ ConstantRange SIToFP = A.castOp(Instruction::SIToFP, 16);
+ EXPECT_EQ(16u, SIToFP.getBitWidth());
+ EXPECT_TRUE(SIToFP.isFullSet());
+
+ ConstantRange SIToFP2 = A.castOp(Instruction::SIToFP, 64);
+ ConstantRange C(APInt(64, -32768), APInt(64, 32768));
+ EXPECT_EQ(64u, SIToFP2.getBitWidth());
+ EXPECT_EQ(C, SIToFP2);
}
TEST_F(ConstantRangeTest, binaryAnd) {
diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
index 01c2b6ced336..9230ac79a48a 100644
--- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
@@ -19,6 +19,7 @@ unittest("StaticAnalysisTests") {
"CallEventTest.cpp",
"ConflictingEvalCallsTest.cpp",
"FalsePositiveRefutationBRVisitorTest.cpp",
+ "MemRegionDescriptiveNameTest.cpp",
"NoStateChangeFuncVisitorTest.cpp",
"ParamRegionTest.cpp",
"RangeSetTest.cpp",
diff --git a/mlir/docs/DataLayout.md b/mlir/docs/DataLayout.md
index b9dde30519d6..86ad51a517ae 100644
--- a/mlir/docs/DataLayout.md
+++ b/mlir/docs/DataLayout.md
@@ -77,6 +77,7 @@ public:
llvm::TypeSize getTypeSizeInBits(Type type) const;
uint64_t getTypeABIAlignment(Type type) const;
uint64_t getTypePreferredAlignment(Type type) const;
+ std::optional<uint64_t> getTypeIndexBitwidth(Type type) const;
};
```
@@ -267,7 +268,8 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
>} {}
```
-specifies that `index` has 32 bits. All other layout properties of `index` match
+specifies that `index` has 32 bits and index computations should be performed
+using 32-bit precision as well. All other layout properties of `index` match
those of the integer type with the same bitwidth defined above.
In absence of the corresponding entry, `index` is assumed to be a 64-bit
diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md
index 1aaefd2f2c30..6a6cefd8785a 100644
--- a/mlir/docs/Tutorials/transform/Ch2.md
+++ b/mlir/docs/Tutorials/transform/Ch2.md
@@ -62,7 +62,7 @@ The operations themselves can be defined using ODS, exactly in the same way as r
#define MY_EXTENSION
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
@@ -124,7 +124,7 @@ This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, tha
```c++
// In MyExtension.h.
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#define GET_OP_CLASSES
#include "MyExtension.h.inc"
diff --git a/mlir/examples/transform/Ch2/include/MyExtension.h b/mlir/examples/transform/Ch2/include/MyExtension.h
index 03a24a190e15..5ab70a505aee 100644
--- a/mlir/examples/transform/Ch2/include/MyExtension.h
+++ b/mlir/examples/transform/Ch2/include/MyExtension.h
@@ -13,7 +13,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#define GET_OP_CLASSES
#include "MyExtension.h.inc"
diff --git a/mlir/examples/transform/Ch2/include/MyExtension.td b/mlir/examples/transform/Ch2/include/MyExtension.td
index 4824b83e6c18..1abd95237055 100644
--- a/mlir/examples/transform/Ch2/include/MyExtension.td
+++ b/mlir/examples/transform/Ch2/include/MyExtension.td
@@ -15,7 +15,7 @@
#define MY_EXTENSION
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/examples/transform/Ch2/lib/MyExtension.cpp b/mlir/examples/transform/Ch2/lib/MyExtension.cpp
index 031c52c30738..b2955a905b88 100644
--- a/mlir/examples/transform/Ch2/lib/MyExtension.cpp
+++ b/mlir/examples/transform/Ch2/lib/MyExtension.cpp
@@ -15,8 +15,8 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/mlir/examples/transform/Ch3/include/MyExtension.h b/mlir/examples/transform/Ch3/include/MyExtension.h
index 223638eee1c0..086850403e1c 100644
--- a/mlir/examples/transform/Ch3/include/MyExtension.h
+++ b/mlir/examples/transform/Ch3/include/MyExtension.h
@@ -13,13 +13,16 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
namespace mlir {
class CallOpInterface;
namespace func {
class CallOp;
} // namespace func
+namespace transform {
+class OperationType;
+} // namespace transform
} // namespace mlir
#define GET_TYPEDEF_CLASSES
diff --git a/mlir/examples/transform/Ch3/include/MyExtension.td b/mlir/examples/transform/Ch3/include/MyExtension.td
index f444df18d69e..5a78186d75c7 100644
--- a/mlir/examples/transform/Ch3/include/MyExtension.td
+++ b/mlir/examples/transform/Ch3/include/MyExtension.td
@@ -16,7 +16,7 @@
include "MyExtensionTypes.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/examples/transform/Ch3/include/MyExtensionTypes.td b/mlir/examples/transform/Ch3/include/MyExtensionTypes.td
index 7d745935d478..8c4b8a9c782b 100644
--- a/mlir/examples/transform/Ch3/include/MyExtensionTypes.td
+++ b/mlir/examples/transform/Ch3/include/MyExtensionTypes.td
@@ -16,7 +16,7 @@
include "mlir/IR/AttrTypeBase.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
// Transform dialect allows additional types to be defined and injected.
def CallOpInterfaceHandle
diff --git a/mlir/examples/transform/Ch3/lib/MyExtension.cpp b/mlir/examples/transform/Ch3/lib/MyExtension.cpp
index dc0a8a0ab303..2e4388d4cc22 100644
--- a/mlir/examples/transform/Ch3/lib/MyExtension.cpp
+++ b/mlir/examples/transform/Ch3/lib/MyExtension.cpp
@@ -15,6 +15,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/IR/DialectImplementation.h"
#include "mlir/Interfaces/CallInterfaces.h"
#include "llvm/ADT/TypeSwitch.h"
diff --git a/mlir/examples/transform/Ch4/include/MyExtension.h b/mlir/examples/transform/Ch4/include/MyExtension.h
index 13e5b3c04b02..620ec8f398a5 100644
--- a/mlir/examples/transform/Ch4/include/MyExtension.h
+++ b/mlir/examples/transform/Ch4/include/MyExtension.h
@@ -13,8 +13,8 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
namespace mlir {
class CallOpInterface;
diff --git a/mlir/examples/transform/Ch4/include/MyExtension.td b/mlir/examples/transform/Ch4/include/MyExtension.td
index ae58dc37db43..6c83ff0f46c8 100644
--- a/mlir/examples/transform/Ch4/include/MyExtension.td
+++ b/mlir/examples/transform/Ch4/include/MyExtension.td
@@ -16,7 +16,7 @@
include "mlir/Dialect/Transform/IR/MatchInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/IR/OpBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h
new file mode 100644
index 000000000000..734ffdba520c
--- /dev/null
+++ b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h
@@ -0,0 +1,21 @@
+//===- MemRefToEmitC.h - Convert MemRef to EmitC --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITC_H
+#define MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITC_H
+
+namespace mlir {
+class RewritePatternSet;
+class TypeConverter;
+
+void populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter);
+
+void populateMemRefToEmitCConversionPatterns(RewritePatternSet &patterns,
+ TypeConverter &converter);
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITC_H
diff --git a/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h
new file mode 100644
index 000000000000..4a63014c19ad
--- /dev/null
+++ b/mlir/include/mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h
@@ -0,0 +1,20 @@
+//===- MemRefToEmitCPass.h - A Pass to convert MemRef to EmitC ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITCPASS_H
+#define MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITCPASS_H
+
+#include <memory>
+
+namespace mlir {
+class Pass;
+
+#define GEN_PASS_DECL_CONVERTMEMREFTOEMITC
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_MEMREFTOEMITC_MEMREFTOEMITCPASS_H
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index f2aa4fb53540..2179ae18ac07 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -45,6 +45,7 @@
#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
#include "mlir/Conversion/MathToLibm/MathToLibm.h"
#include "mlir/Conversion/MathToSPIRV/MathToSPIRVPass.h"
+#include "mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.h"
#include "mlir/Conversion/NVGPUToNVVM/NVGPUToNVVM.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index bd81cc6d5323..7e7ee3a2f780 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -754,6 +754,15 @@ def ConvertMathToFuncs : Pass<"convert-math-to-funcs", "ModuleOp"> {
}
//===----------------------------------------------------------------------===//
+// MemRefToEmitC
+//===----------------------------------------------------------------------===//
+
+def ConvertMemRefToEmitC : Pass<"convert-memref-to-emitc"> {
+ let summary = "Convert MemRef dialect to EmitC dialect";
+ let dependentDialects = ["emitc::EmitCDialect"];
+}
+
+//===----------------------------------------------------------------------===//
// MemRefToLLVM
//===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
index 4fb4ab08a0da..dcf934c71dd1 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
@@ -12,7 +12,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/RegionKindInterface.h"
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
index 0eb670506086..8aaa87511a2b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
@@ -11,7 +11,7 @@
include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h
index f52f04ada036..1001cb52b518 100644
--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.h
@@ -10,8 +10,8 @@
#define MLIR_DIALECT_AFFINE_TRANSFORMOPS_AFFINETRANSFORMOPS_H
#include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
index b74e4af6eedd..70b127fd063c 100644
--- a/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
+++ b/mlir/include/mlir/Dialect/Affine/TransformOps/AffineTransformOps.td
@@ -10,7 +10,7 @@
#define AFFINE_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 3a61a4b34765..2d8add82383b 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -366,10 +366,6 @@ struct BufferizationOptions {
DefaultMemorySpaceFn defaultMemorySpaceFn =
[](TensorType t) -> std::optional<Attribute> { return Attribute(); };
- /// Seed for the analysis fuzzer. If set to `0`, the fuzzer is deactivated.
- /// Should be used only with `testAnalysisOnly = true`.
- unsigned analysisFuzzerSeed = 0;
-
/// If set to `true`, the analysis is skipped. A buffer is copied before every
/// write. This flag cannot be used together with `testAnalysisOnly = true`.
bool copyBeforeWrite = false;
diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h
index 75ce4b484165..1dbe29b44413 100644
--- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h
+++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h
@@ -11,8 +11,8 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
namespace mlir {
diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
index 9b588eb610e5..5ace9c390e14 100644
--- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
@@ -11,7 +11,7 @@
include "mlir/Dialect/Bufferization/IR/BufferizationEnums.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
index a29af853eb21..d50a3042aeea 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -24,7 +24,12 @@ class OneShotAnalysisState;
/// Options for analysis-enabled bufferization.
struct OneShotBufferizationOptions : public BufferizationOptions {
- enum class AnalysisHeuristic { BottomUp, TopDown };
+ enum class AnalysisHeuristic {
+ BottomUp,
+ TopDown,
+ BottomUpFromTerminators,
+ Fuzzer
+ };
OneShotBufferizationOptions() = default;
@@ -42,6 +47,11 @@ struct OneShotBufferizationOptions : public BufferizationOptions {
/// Specify the functions that should not be analyzed. copyBeforeWrite will be
/// set to true when bufferizing them.
llvm::ArrayRef<std::string> noAnalysisFuncFilter;
+
+ /// Seed for the analysis fuzzer. Used only if the heuristic is set to
+ /// `AnalysisHeuristic::Fuzzer`. The fuzzer should be used only with
+ /// `testAnalysisOnly = true`.
+ unsigned analysisFuzzerSeed = 0;
};
/// State for analysis-enabled bufferization. This class keeps track of alias
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 1c3cdec81a39..1303dc2c9ae1 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -459,6 +459,24 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
argument is read/written and which returned values are aliasing/equivalent.
For debugging purposes, such information can be printed with
`test-analysis-only`.
+
+ The order in which ops are analyzed is important. The analysis is greedy and
+ ops that are analyzed earlier are more likely to bufferize in-place. The
+ heuristic can be set with `analysis-heuristic`. At the moment, the following
+ heuristics are available:
+
+ * `bottom-up` (default): Analyze ops from bottom to top.
+ * `top-down`: Analyze ops from top to bottom.
+ * `fuzzer`: Randomize the ordering of ops with `analysis-fuzzer-seed`.
+ * `bottom-up-from-terminators`: Traverse the reverse use-def chains of
+ tensor IR, starting from region branch terminators (bottom-up). Nested
+ regions are traversed before enclosing regions. Analyze the traversed ops
+ first, then analyze the remaining ops bottom-up. This heuristic is useful
+ for bufferizing loop constructs. One-Shot Bufferize currently supports
+ only such IR where yielded tensor values bufferize to equivalent region
+ iter_args, and first analyzing all ops on the path from the "yielding" op
+ to the beginning of the loop body makes it more likely for the region
+ iter_args and yielded values to bufferize to equivalent buffers.
}];
let options = [
Option<"allowReturnAllocsFromLoops", "allow-return-allocs-from-loops",
diff --git a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h
index 8d0b97da95d4..37f0ea0f2855 100644
--- a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h
+++ b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.h
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_FUNC_TRANSFORMOPS_FUNCTRANSFORMOPS_H
#include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#define GET_OP_CLASSES
diff --git a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td
index c36fdd150556..306fbf881de6 100644
--- a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td
+++ b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td
@@ -10,7 +10,7 @@
#define FUNC_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/RegionKindInterface.td"
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
index d6612c7c0b7f..4b5f68452504 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_GPU_TRANSFORMOPS_GPUTRANSFORMOPS_H
#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
index 616a82b08a61..80b4547c32c1 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -10,7 +10,7 @@
#define GPU_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
index c65f522e0187..52fc6f4d5c71 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h
@@ -11,7 +11,7 @@
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
index 96cdbf01b4bd..b7176aa93ff1 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMTypes.td
@@ -123,7 +123,7 @@ def LLVMFunctionType : LLVMType<"LLVMFunction", "func"> {
def LLVMPointerType : LLVMType<"LLVMPointer", "ptr", [
DeclareTypeInterfaceMethods<DataLayoutTypeInterface, [
- "areCompatible", "verifyEntries"]>]> {
+ "getIndexBitwidth", "areCompatible", "verifyEntries"]>]> {
let summary = "LLVM pointer type";
let description = [{
The `!llvm.ptr` type is an LLVM pointer type. This type typically represents
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
index 12923663b3fb..3af642752724 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
@@ -14,7 +14,7 @@
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/RegionKindInterface.h"
@@ -38,6 +38,9 @@ class UnPackOp;
} // namespace tensor
namespace transform {
+class AnyOpType;
+class AnyValueType;
+class OperationType;
class TransformHandleTypeInterface;
// Types needed for builders.
struct TileSizesSpec {};
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index bdeab55091b9..4f34016066b4 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -12,7 +12,7 @@
include "mlir/Dialect/Linalg/TransformOps/LinalgTransformEnums.td"
include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h
index ee7c69683148..a87767acdd3b 100644
--- a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h
+++ b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h
@@ -10,13 +10,16 @@
#define MLIR_DIALECT_MEMREF_TRANSFORMOPS_MEMREFTRANSFORMOPS_H
#include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
namespace mlir {
namespace memref {
class AllocOp;
} // namespace memref
+namespace transform {
+class OperationType;
+} // namespace transform
} // namespace mlir
#define GET_OP_CLASSES
diff --git a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
index 29383a3825be..2d060f3c2da6 100644
--- a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
+++ b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td
@@ -10,7 +10,7 @@
#define MEMREF_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h
index 1c30cc4a57d8..5179adeb09db 100644
--- a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.h
@@ -11,7 +11,7 @@
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/RegionKindInterface.h"
diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
index bce84cb3fdea..0225562baa58 100644
--- a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
@@ -11,7 +11,7 @@
include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
diff --git a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h
index d14d63e56dc7..65ccd43b56c8 100644
--- a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h
+++ b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.h
@@ -10,8 +10,8 @@
#define MLIR_DIALECT_SCF_TRANSFORMOPS_SCFTRANSFORMOPS_H
#include "mlir/Bytecode/BytecodeOpInterface.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
index cef73689c072..6f94cee5b019 100644
--- a/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
+++ b/mlir/include/mlir/Dialect/SCF/TransformOps/SCFTransformOps.td
@@ -10,7 +10,7 @@
#define SCF_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h b/mlir/include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h
index 1c52e4881dc8..54a9e2aec805 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/TransformOps/SparseTensorTransformOps.h
@@ -12,7 +12,7 @@
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/RegionKindInterface.h"
diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h
index 0e5fda6041fa..4fb777df3409 100644
--- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h
+++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.h
@@ -9,8 +9,8 @@
#ifndef MLIR_DIALECT_TENSOR_TRANSFORMOPS_TENSORTRANSFORMOPS_H
#define MLIR_DIALECT_TENSOR_TRANSFORMOPS_TENSORTRANSFORMOPS_H
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td
index 39e1d7fa3494..fea5afa0b7bb 100644
--- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td
@@ -10,7 +10,7 @@
#define TENSOR_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/Transform/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
index ef44ab891cc5..0cd71ec6919d 100644
--- a/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Transform/CMakeLists.txt
@@ -1,4 +1,5 @@
add_subdirectory(DebugExtension)
+add_subdirectory(Interfaces)
add_subdirectory(IR)
add_subdirectory(LoopExtension)
add_subdirectory(PDLExtension)
diff --git a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h
index d660393a71e0..05abe5adbe80 100644
--- a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h
+++ b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h
@@ -12,7 +12,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td
index 16e2a39044b4..dc9b7c4229ac 100644
--- a/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td
+++ b/mlir/include/mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.td
@@ -17,7 +17,7 @@
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
include "mlir/Dialect/Transform/IR/MatchInterfaces.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
def DebugEmitRemarkAtOp : TransformDialectOp<"debug.emit_remark_at",
diff --git a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
index 2b8a887257ca..3ccac2f5c165 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt
@@ -24,17 +24,7 @@ add_dependencies(mlir-headers MLIRTransformDialectEnumIncGen)
add_mlir_dialect(TransformOps transform)
add_mlir_doc(TransformOps TransformOps Dialects/ -gen-op-doc -dialect=transform)
-# Contrary to what the name claims, this only produces the _op_ interface.
-add_mlir_interface(TransformInterfaces)
-add_mlir_doc(TransformInterfaces TransformOpInterfaces Dialects/ -gen-op-interface-docs)
-
add_mlir_interface(MatchInterfaces)
add_dependencies(MLIRMatchInterfacesIncGen MLIRTransformInterfacesIncGen)
add_mlir_doc(TransformInterfaces MatchOpInterfaces Dialects/ -gen-op-interface-docs)
-set(LLVM_TARGET_DEFINITIONS TransformInterfaces.td)
-mlir_tablegen(TransformTypeInterfaces.h.inc -gen-type-interface-decls)
-mlir_tablegen(TransformTypeInterfaces.cpp.inc -gen-type-interface-defs)
-add_public_tablegen_target(MLIRTransformDialectTypeInterfacesIncGen)
-add_dependencies(mlir-headers MLIRTransformDialectTypeInterfacesIncGen)
-add_mlir_doc(TransformInterfaces TransformTypeInterfaces Dialects/ -gen-type-interface-docs)
diff --git a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
index 36aeb4583029..13a52b54201e 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.h
@@ -12,7 +12,7 @@
#include <optional>
#include <type_traits>
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "llvm/ADT/STLExtras.h"
diff --git a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
index be92e4d91b42..56d2ac648599 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/MatchInterfaces.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
include "mlir/IR/OpBase.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
def MatchOpInterface
: OpInterface<"MatchOpInterface", [TransformOpInterface]> {
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
index 33c2e7ce0e6d..d03049e186f9 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td
@@ -23,7 +23,7 @@ def Transform_Dialect : Dialect {
/// Symbol name for the default entry point "named sequence".
constexpr const static ::llvm::StringLiteral
kTransformEntryPointSymbolName = "__transform_main";
-
+
/// Name of the attribute attachable to the symbol table operation
/// containing named sequences. This is used to trigger verification.
constexpr const static ::llvm::StringLiteral
@@ -34,12 +34,6 @@ def Transform_Dialect : Dialect {
constexpr const static ::llvm::StringLiteral kTargetTagAttrName =
"transform.target_tag";
- /// Name of the attribute attachable to an operation, indicating that
- /// TrackingListener failures should be silenced.
- constexpr const static ::llvm::StringLiteral
- kSilenceTrackingFailuresAttrName =
- "transform.silence_tracking_failures";
-
/// Names of the attributes indicating whether an argument of an external
/// transform dialect symbol is consumed or only read.
constexpr const static ::llvm::StringLiteral kArgConsumedAttrName =
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.h b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.h
index 1f46e34171b8..6c10fcf75804 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.h
@@ -13,8 +13,8 @@
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 1766e4bb875f..9caa7632c177 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -21,7 +21,7 @@ include "mlir/IR/SymbolInterfaces.td"
include "mlir/Dialect/Transform/IR/MatchInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformAttrs.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
def AlternativesOp : TransformDialectOp<"alternatives",
[DeclareOpInterfaceMethods<RegionBranchOpInterface,
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.h b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.h
index bf0fc208829a..a0d54253c80d 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.h
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.h
@@ -9,6 +9,7 @@
#ifndef MLIR_DIALECT_TRANSFORM_IR_TRANSFORMTYPES_H
#define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMTYPES_H
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Utils/DiagnosedSilenceableFailure.h"
#include "mlir/IR/Types.h"
#include "mlir/Support/LLVM.h"
@@ -19,7 +20,6 @@ class Operation;
class Type;
} // namespace mlir
-#include "mlir/Dialect/Transform/IR/TransformTypeInterfaces.h.inc"
#define GET_TYPEDEF_CLASSES
#include "mlir/Dialect/Transform/IR/TransformTypes.h.inc"
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td
index 44dfce913d15..2d9a26e165b6 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformTypes.td
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMTYPES
include "mlir/IR/AttrTypeBase.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
def Transform_AffineMapParamType : TypeDef<Transform_Dialect, "AffineMapParam",
diff --git a/mlir/include/mlir/Dialect/Transform/Interfaces/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/Interfaces/CMakeLists.txt
new file mode 100644
index 000000000000..b3396b67b4f7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Transform/Interfaces/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Contrary to what the name claims, this only produces the _op_ interface. The
+# type interface is produced manually below.
+add_mlir_interface(TransformInterfaces)
+add_mlir_doc(TransformInterfaces TransformOpInterfaces Dialects/ -gen-op-interface-docs)
+
+set(LLVM_TARGET_DEFINITIONS TransformInterfaces.td)
+mlir_tablegen(TransformTypeInterfaces.h.inc -gen-type-interface-decls)
+mlir_tablegen(TransformTypeInterfaces.cpp.inc -gen-type-interface-defs)
+add_public_tablegen_target(MLIRTransformDialectTypeInterfacesIncGen)
+add_dependencies(mlir-headers MLIRTransformDialectTypeInterfacesIncGen)
+add_mlir_doc(TransformInterfaces TransformTypeInterfaces Dialects/ -gen-type-interface-docs)
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
index 32724ff4b98e..59cc2f22c938 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.h
+++ b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h
@@ -6,10 +6,9 @@
//
//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_TRANSFORM_IR_TRANSFORMINTERFACES_H
-#define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMINTERFACES_H
+#ifndef MLIR_DIALECT_TRANSFORM_INTERFACES_TRANSFORMINTERFACES_H
+#define MLIR_DIALECT_TRANSFORM_INTERFACES_TRANSFORMINTERFACES_H
-#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/Dialect/Transform/Utils/DiagnosedSilenceableFailure.h"
#include "mlir/Dialect/Transform/Utils/RaggedArray.h"
#include "mlir/IR/OpDefinition.h"
@@ -18,6 +17,8 @@
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.h.inc"
+
namespace mlir {
namespace transform {
@@ -74,7 +75,7 @@ getConsumedHandleOpOperands(transform::TransformOpInterface transformOp);
} // namespace transform
} // namespace mlir
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h.inc"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h.inc"
namespace mlir {
namespace transform {
@@ -1591,4 +1592,4 @@ mlir::transform::TransformEachOpTrait<OpTy>::verifyTrait(Operation *op) {
return success();
}
-#endif // DIALECT_TRANSFORM_IR_TRANSFORMINTERFACES_H
+#endif // DIALECT_TRANSFORM_INTERFACES_TRANSFORMINTERFACES_H
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.td
index 8f7b8f1999e0..c5c4c61bc2fe 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td
+++ b/mlir/include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.td
@@ -241,6 +241,14 @@ def FindPayloadReplacementOpInterface
/*arguments=*/(ins)
>,
];
+
+ let extraSharedClassDeclaration = [{
+ /// Name of the attribute attachable to an operation, indicating that
+ /// TrackingListener failures should be silenced.
+ constexpr const static ::llvm::StringLiteral
+ kSilenceTrackingFailuresAttrName =
+ "transform.silence_tracking_failures";
+ }];
}
def PatternDescriptorOpInterface : OpInterface<"PatternDescriptorOpInterface"> {
diff --git a/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h b/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h
index 68cc0699d081..f3f696bb0b9c 100644
--- a/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h
+++ b/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.h
@@ -11,7 +11,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td b/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td
index 78a8c6ad489a..885b55811e62 100644
--- a/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td
+++ b/mlir/include/mlir/Dialect/Transform/LoopExtension/LoopExtensionOps.td
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_TRANSFORM_LOOPEXTENSION_LOOPEXTENSIONOPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
def HoistLoopInvariantSubsetsOp
diff --git a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h
index 5172bcf204e5..7f52e00ec30b 100644
--- a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h
+++ b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h
@@ -11,7 +11,7 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/SymbolTable.h"
diff --git a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td
index 206a799690aa..4c4d230d2ae4 100644
--- a/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td
+++ b/mlir/include/mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.td
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_TRANSFORM_PDLEXTENSION_PDLEXTENSIONOPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpAsmInterface.td"
include "mlir/IR/SymbolInterfaces.td"
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
index 16ef0bc6a739..3a4b391fd7f4 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h
@@ -14,7 +14,7 @@
#ifndef MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERPASSBASE_H
#define MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERPASSBASE_H
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include <memory>
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
index 738e0c533c6b..4c16d40d368e 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
@@ -10,7 +10,7 @@
#define MLIR_DIALECT_TRANSFORM_TRANSFORMS_TRANSFORMINTERPRETERUTILS_H
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
#include <memory>
diff --git a/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h
new file mode 100644
index 000000000000..31e19ff1ad39
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h
@@ -0,0 +1,104 @@
+//===- ScalableValueBoundsConstraintSet.h - Scalable Value Bounds ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_VECTOR_IR_SCALABLEVALUEBOUNDSCONSTRAINTSET_H
+#define MLIR_DIALECT_VECTOR_IR_SCALABLEVALUEBOUNDSCONSTRAINTSET_H
+
+#include "mlir/Analysis/Presburger/IntegerRelation.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
+
+namespace mlir::vector {
+
+namespace detail {
+
+/// Parent class for the value bounds RTTIExtends. Uses protected inheritance to
+/// hide all ValueBoundsConstraintSet methods by default (as some do not use the
+/// ScalableValueBoundsConstraintSet, so may produce unexpected results).
+struct ValueBoundsConstraintSet : protected ::mlir::ValueBoundsConstraintSet {
+ using ::mlir::ValueBoundsConstraintSet::ValueBoundsConstraintSet;
+};
+} // namespace detail
+
+/// A version of `ValueBoundsConstraintSet` that can solve for scalable bounds.
+struct ScalableValueBoundsConstraintSet
+ : public llvm::RTTIExtends<ScalableValueBoundsConstraintSet,
+ detail::ValueBoundsConstraintSet> {
+ ScalableValueBoundsConstraintSet(MLIRContext *context, unsigned vscaleMin,
+ unsigned vscaleMax)
+ : RTTIExtends(context), vscaleMin(vscaleMin), vscaleMax(vscaleMax){};
+
+ using RTTIExtends::bound;
+ using RTTIExtends::StopConditionFn;
+
+ /// A thin wrapper over an `AffineMap` which can represent a constant bound,
+ /// or a scalable bound (in terms of vscale). The `AffineMap` will always
+ /// take at most one parameter, vscale, and returns a single result, which is
+ /// the bound of value.
+ struct ConstantOrScalableBound {
+ AffineMap map;
+
+ struct BoundSize {
+ int64_t baseSize{0};
+ bool scalable{false};
+ };
+
+ /// Get the (possibly) scalable size of the bound, returns failure if
+ /// the bound cannot be represented as a single quantity.
+ FailureOr<BoundSize> getSize() const;
+ };
+
+ /// Computes a (possibly) scalable bound for a given value. This is
+ /// similar to `ValueBoundsConstraintSet::computeConstantBound()`, but
+ /// uses knowledge of the range of vscale to compute either a constant
+ /// bound, an expression in terms of vscale, or failure if no bound can
+ /// be computed.
+ ///
+ /// The resulting `AffineMap` will always take at most one parameter,
+ /// vscale, and return a single result, which is the bound of `value`.
+ ///
+  /// Note: `vscaleMin` must be `<=` `vscaleMax`. If `vscaleMin` ==
+ /// `vscaleMax`, the resulting bound (if found), will be constant.
+ static FailureOr<ConstantOrScalableBound>
+ computeScalableBound(Value value, std::optional<int64_t> dim,
+ unsigned vscaleMin, unsigned vscaleMax,
+ presburger::BoundType boundType, bool closedUB = true,
+ StopConditionFn stopCondition = nullptr);
+
+  /// Get the value of vscale. Returns `nullptr` if vscale has not been encountered.
+ Value getVscaleValue() const { return vscale; }
+
+ /// Sets the value of vscale. Asserts if vscale has already been set.
+ void setVscale(vector::VectorScaleOp vscaleOp) {
+ assert(!vscale && "expected vscale to be unset");
+ vscale = vscaleOp.getResult();
+ }
+
+ /// The minimum possible value of vscale.
+ unsigned getVscaleMin() const { return vscaleMin; }
+
+ /// The maximum possible value of vscale.
+ unsigned getVscaleMax() const { return vscaleMax; }
+
+ static char ID;
+
+private:
+ const unsigned vscaleMin;
+ const unsigned vscaleMax;
+
+ // This will be set when the first `vector.vscale` operation is found within
+ // the `ValueBoundsOpInterface` implementation then reused from there on.
+ Value vscale = nullptr;
+};
+
+using ConstantOrScalableBound =
+ ScalableValueBoundsConstraintSet::ConstantOrScalableBound;
+
+} // namespace mlir::vector
+
+#endif // MLIR_DIALECT_VECTOR_IR_SCALABLEVALUEBOUNDSCONSTRAINTSET_H
diff --git a/mlir/include/mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h b/mlir/include/mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h
new file mode 100644
index 000000000000..4794bc9016c6
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h
@@ -0,0 +1,20 @@
+//===- ValueBoundsOpInterfaceImpl.h - Impl. of ValueBoundsOpInterface -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_VECTOR_IR_VALUEBOUNDSOPINTERFACEIMPL_H
+#define MLIR_DIALECT_VECTOR_IR_VALUEBOUNDSOPINTERFACEIMPL_H
+
+namespace mlir {
+class DialectRegistry;
+
+namespace vector {
+void registerValueBoundsOpInterfaceExternalModels(DialectRegistry &registry);
+} // namespace vector
+} // namespace mlir
+
+#endif // MLIR_DIALECT_VECTOR_IR_VALUEBOUNDSOPINTERFACEIMPL_H
diff --git a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h
index 1fa9057a0682..c43a8a0b02c6 100644
--- a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h
+++ b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.h
@@ -9,7 +9,7 @@
#ifndef MLIR_DIALECT_VECTOR_TRANSFORMOPS_VECTORTRANSFORMOPS_H
#define MLIR_DIALECT_VECTOR_TRANSFORMOPS_VECTORTRANSFORMOPS_H
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/OpImplementation.h"
diff --git a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
index 83df5fe27d7a..f6371f39c394 100644
--- a/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
+++ b/mlir/include/mlir/Dialect/Vector/TransformOps/VectorTransformOps.td
@@ -10,7 +10,7 @@
#define VECTOR_TRANSFORM_OPS
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Vector/Transforms/VectorTransformsBase.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
index 3ce16ef361f3..35e76a8b623a 100644
--- a/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
+++ b/mlir/include/mlir/Dialect/Vector/Utils/VectorUtils.h
@@ -112,6 +112,64 @@ SmallVector<OpFoldResult> getMixedSizesXfer(bool hasTensorSemantics,
Operation *xfer,
RewriterBase &rewriter);
+/// A pattern for ops that implement `MaskableOpInterface` and that _might_ be
+/// masked (i.e. inside `vector.mask` Op region). In particular:
+/// 1. Matches `SourceOp` operation, Op.
+/// 2.1. If Op is masked, retrieves the masking Op, maskOp, and updates the
+/// insertion point to avoid inserting new ops into the `vector.mask` Op
+/// region (which only allows one Op).
+/// 2.2 If Op is not masked, this step is skipped.
+/// 3. Invokes `matchAndRewriteMaskableOp` on Op and optionally maskOp if
+/// found in step 2.1.
+///
+/// This wrapper frees patterns from re-implementing the logic to update the
+/// insertion point when a maskable Op is masked. Such patterns are still
+/// responsible for providing an updated ("rewritten") version of:
+/// a. the source Op when mask _is not_ present,
+/// b. the source Op and the masking Op when mask _is_ present.
+/// Note that the return value from `matchAndRewriteMaskableOp` depends on the
+/// case above.
+template <class SourceOp>
+struct MaskableOpRewritePattern : OpRewritePattern<SourceOp> {
+ using OpRewritePattern<SourceOp>::OpRewritePattern;
+
+private:
+ LogicalResult matchAndRewrite(SourceOp sourceOp,
+ PatternRewriter &rewriter) const final {
+ auto maskableOp = dyn_cast<MaskableOpInterface>(sourceOp.getOperation());
+ if (!maskableOp)
+ return failure();
+
+ Operation *rootOp = sourceOp;
+
+ // If this Op is masked, update the insertion point to avoid inserting into
+ // the vector.mask Op region.
+ OpBuilder::InsertionGuard guard(rewriter);
+ MaskingOpInterface maskOp;
+ if (maskableOp.isMasked()) {
+ maskOp = maskableOp.getMaskingOp();
+ rewriter.setInsertionPoint(maskOp);
+ rootOp = maskOp;
+ }
+
+ FailureOr<Value> newOp =
+ matchAndRewriteMaskableOp(sourceOp, maskOp, rewriter);
+ if (failed(newOp))
+ return failure();
+
+ rewriter.replaceOp(rootOp, *newOp);
+ return success();
+ }
+
+public:
+ // Matches SourceOp that can potentially be masked with `maskingOp`. If the
+ // latter is present, returns an updated masking op (with a replacement for
+ // `sourceOp` nested inside). Otherwise, returns an updated `sourceOp`.
+ virtual FailureOr<Value>
+ matchAndRewriteMaskableOp(SourceOp sourceOp, MaskingOpInterface maskingOp,
+ PatternRewriter &rewriter) const = 0;
+};
+
} // namespace vector
/// Constructs a permutation map of invariant memref indices to vector
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 7aaa4ecc7ee7..87aabdc015fe 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -9,7 +9,12 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPU_H
#define MLIR_DIALECT_XEGPU_IR_XEGPU_H
-#include <mlir/IR/Dialect.h>
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Interfaces/ShapedOpInterfaces.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
namespace mlir {
namespace xegpu {
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index bb325c272e33..cd38549f1ccf 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -10,6 +10,7 @@
#define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/EnumAttr.td"
class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
@@ -17,4 +18,64 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
let mnemonic = attrMnemonic;
}
+def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+ let parameters = (ins
+ OptionalParameter<"MemoryScopeAttr">: $memory_scope,
+ OptionalParameter<"IntegerAttr", "1">: $array_length,
+ OptionalParameter<"BoolAttr", "true">: $boundary_check
+ );
+
+ let builders = [
+ AttrBuilder<(ins
+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
+ CArg<"int", "1">:$array_length,
+ CArg<"bool", "true">: $boundary_check
+ )>
+ ];
+
+ let assemblyFormat = "`<` struct(params) `>`";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Memory Scope Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
+def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+ "The address space of the memory the tensor descriptor is created for",
+ [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_MemoryScopeAttr:
+ EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+ let assemblyFormat = "$value";
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU Cache Enums.
+//===----------------------------------------------------------------------===//
+def XeGPU_CachePolicyCached: I32EnumAttrCase<"CACHED", 0, "cached">; // valid for read and write
+def XeGPU_CachePolicyUncached: I32EnumAttrCase<"UNCACHED", 1, "uncached">; // valid for read and write
+def XeGPU_CachePolicyStreaming: I32EnumAttrCase<"STREAMING", 2, "streaming">; // valid for read only
+def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">; // valid for read only
+def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
+def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
+
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+ [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
+ XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
+ XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::xegpu";
+}
+
+def XeGPU_CacheHintAttr
+ : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+ let assemblyFormat = "`<` $value `>`";
+}
+
+
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
index 3851275ad30a..c2f09319c790 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td
@@ -23,8 +23,8 @@ def XeGPU_Dialect : Dialect {
the lower-level GPU compiler.
}];
- // let useDefaultTypePrinterParser = true;
- // let useDefaultAttributePrinterParser = true;
+ let useDefaultTypePrinterParser = true;
+ let useDefaultAttributePrinterParser = true;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUDIALECT_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 5825ef9195b0..93c56ad05b43 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -9,10 +9,13 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
+include "mlir/IR/AttrTypeBase.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
-
+include "mlir/Interfaces/ShapedOpInterfaces.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/ViewLikeInterface.td"
// Base class for dialect operations. This operation inherits from the base
// `Op` class in OpBase.td, and provides:
@@ -20,7 +23,305 @@ include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
// * The mnemonic for the operation, or the name without the dialect prefix.
// * A list of traits for the operation.
class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
- Op<XeGPU_Dialect, mnemonic, traits>;
+ Op<XeGPU_Dialect, mnemonic, traits> {
+
+ code extraBaseClassDeclaration = [{
+ void printProperties(::mlir::MLIRContext *ctx,
+ ::mlir::OpAsmPrinter &p, const Properties &prop) {
+ Attribute propAttr = getPropertiesAsAttr(ctx, prop);
+ if (propAttr)
+ p << "<" << propAttr << ">";
+ }
+
+ static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
+ ::mlir::OperationState &result) {
+ if (mlir::succeeded(parser.parseLess())) {
+ if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
+ return failure();
+ }
+ return success();
+ }
+
+ }];
+}
+
+
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
+ AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
+
+ let summary = "Create nd-tensor descriptor operation";
+ let description = [{
+ The "create_nd_tdesc" operation creates a TensorDescType which represents
+ a sub-view of a 2D memory region (It can be extended to support n-D memory
+ region if needed in future). Elements in the subview are contiguous in each
+ dimension. It encodes the following important information for supporting
+ Intel hardware features:
+
+ * source: an object representing (starting address/pointer of) a 2D memory region.
+ It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
+ for the latter case, the shape and layout information of the 2D memory region should
+ be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
+ * offsets: two index values representing offsets from the "source" at each dimension
+ at which the subview of the target memory will be created. It is encoded via two
+ variables, including "dynamic_offsets" and "static_offsets", such that it can
+ accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
+ * shape: the shape information of the memory region pointed by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ But if "source" is simply a pointer represented as uint64_t type, or a memref
+ type without shape information e.g., memref<?x?xf16>, the shape information has
+ to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape"
+ only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
+ * strides: the strides of the memory region pointed by the "source". Similar to shape,
+ it is typically encoded via the MemRefType of the source too. But if "source" is
+ simply a pointer represented as uint64_t type, or a memref type without shape
+ information e.g., memref<?x?xf16>, the strides information has to be explicitly
+ passed via the "dynamic_strides" argument. And it currently only accepts operands too.
+
+ Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc() : memref<1024x1024xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+
+ Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = memref.alloc(%h, %w) : memref<?x?xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+
+ Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+ %0 = ... : ui64
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ }];
+
+ let arguments = (ins
+ XeGPU_BaseAddrType: $source,
+ Variadic<Index>: $offsets,
+ Variadic<Index>: $shape,
+ Variadic<Index>: $strides,
+ DenseI64ArrayAttr: $const_offsets,
+ OptionalAttr<DenseI64ArrayAttr>: $const_shape,
+ OptionalAttr<DenseI64ArrayAttr>: $const_strides
+ );
+ let results = (outs XeGPU_TensorDesc: $TensorDesc);
+
+ let assemblyFormat = [{
+ $source ``
+ custom<DynamicIndexList>($offsets, $const_offsets)
+ (`,` custom<DynamicIndexList>($shape, $const_shape)^
+ `,` custom<DynamicIndexList>($strides, $const_strides))?
+ attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+ }];
+
+ let hasVerifier = 1;
+
+ let builders = [
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+
+ OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source,
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
+ "llvm::ArrayRef<OpFoldResult>": $shape,
+ "llvm::ArrayRef<OpFoldResult>": $strides)>
+ ];
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ /// Returns the type of the source memref operand.
+ Type getSourceType() {
+ return getSource().getType();
+ }
+
+ /// Returns the type of the result TensorDesc.
+ xegpu::TensorDescType getType() {
+ return getTensorDesc().getType();
+ }
+
+ /// Return the element type of the TensorDesc
+ Type getElementType() {
+ return getType().getElementType();
+ }
+
+ /// Return the shape of the TensorDesc
+ llvm::ArrayRef<int64_t> getTensorDescShape() {
+ return getType().getShape();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ OperandRange getSizes() {
+ return getShape();
+ }
+
+ ArrayRef<int64_t> getStaticOffsets(){
+ return getConstOffsets();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ /// If source is IntegerType or `const_shape` is filled,
+ /// it will return `const_shape`, such that mixes of `shape`
+ /// and `const_shape` will be used to represent the shape of
+ /// source operand. They override the static shape from the source memref type.
+ ArrayRef<int64_t> getStaticSizes() {
+ auto attr = getConstShapeAttr();
+ if (getSourceType().isa<IntegerType>() || attr)
+ return attr;
+
+ auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ assert(memrefType && "Incorrect use of getStaticSizes");
+ return memrefType.getShape();
+ }
+
+ /// wrapper for matching with OffsetSizeAndStrideOpInterface
+ /// If source is IntegerType or `const_strides` is filled, it
+ /// will return `const_strides`, such that mixes of `strides`
+ /// and `const_strides` will be used to represent the strides of
+ /// source operand. They override the static strides from the source memref type.
+ ArrayRef<int64_t> getStaticStrides() {
+ auto attr = getConstStridesAttr();
+ if (getSourceType().isa<IntegerType>() || attr)
+ return attr;
+
+ auto memrefType = getSourceType().dyn_cast<MemRefType>();
+ assert(memrefType && "Incorrect use of getStaticStrides");
+ auto [strides, offset] = getStridesAndOffset(memrefType);
+ // reuse the storage of ConstStridesAttr since strides from
+ // memref is not persistent
+ setConstStrides(strides);
+ attr = getConstStridesAttr();
+ return attr;
+ }
+
+ /// Return the expected rank of each of the`static_offsets`,
+ /// `static_shape` and `static_strides` attributes.
+ std::array<unsigned, 3> getArrayAttrMaxRanks() {
+ unsigned rank;
+ if (auto ty = getSourceType().dyn_cast<MemRefType>()) {
+ rank = ty.getRank();
+ } else {
+ rank = (unsigned)getMixedOffsets().size();
+ }
+ return {rank, rank, rank};
+ }
+
+ /// Return the number of leading operands before the `offsets`,
+ /// `shape` and `strides` operands.
+ static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
+
+ mlir::Value getViewSource() { return getSource(); }
+ }];
+}
+
+def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
+ let summary = "prefetches a nD block to cache";
+ let description = [{
+ It issues an instruction to prefetch the data from memory to each
+ level of the cache based on their cache policy.
+
+ Example:
+ ```
+ xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<cached>,
+ l3_hint = #xegpu.cache_hint<cached>}
+ : !xegpu.tensor_desc<8x16xf16>
+ ```
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+}
+
+
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
+ let summary = "loads a n-D block from memory (represented by TensorDesc)"
+ "to registers (represented by vector)";
+ let description = [{
+ LoadNdOp essentially mimics the hardware block read instruction to read
+ a block of data from memory to register. It takes a set of optional cache
+ hints for each level of cache, L1, L2 and L3. If hardware does not have a
+ corresponding cache, the corresponding cache hint attribute will be masked.
+ vnni transform is an hardware feature for Intel GPU, which is used to
+ do data packing during the load for B operand of matrix operation, if
+ the bit width of the data type is less than 32 bits, e.g., fp16. And
+ transpose is another Intel hardware feature, which will do transpose
+ operation when loading the data if the bit width of the data type is
+ fp32 or fp64. It implies that vnni and transpose cannot exist at the
+ same time.
+
+ Example:
+ ```
+ xegpu.load_nd %1 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<streaming>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
+
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
+
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+ let hasVerifier = 1;
+}
+
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+ let summary = "stores a n-D block register region back to memory, currently only supports 2D";
+
+ let description = [{
+ StoreNdOp essentially mimics the hardware block write instruction to
+ write a block of data from register into the memory region as described
+ by the TensorDesc. It takes a set of optional cache hints for each level
+ of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
+ the corresponding cache hint attribute will be masked.
+
+ Example:
+ ```
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration;
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc))}];
+ let hasVerifier = 1;
+}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 1d75bb4e2906..19ac1693712d 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -9,9 +9,9 @@
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
-include "mlir/IR/BuiltinTypes.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
+include "mlir/IR/BuiltinTypes.td"
def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>;
def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>;
@@ -30,4 +30,106 @@ class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
let mnemonic = typeMnemonic;
}
+def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
+ [ShapedTypeInterface], "::mlir::TensorType"> {
+ let summary = "TensorDesc describing regions of interested data.";
+ let description = [{
+ TensorDesc is a type designed to describe regions of the interested data as well as some
+ features that are unique to Intel hardware. Different from the builtin tensor type in MLIR,
+ it essentially only contains the meta data, and doesn't hold the data by itself. It is designed
+ to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
+ It encodes the following information:
+
+ * shape: the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
+ and each row contains 16 contiguous data element. The rows could be
+ either contiguous or not, depends on whether the encoding attribute
+ is set or not.
+ * element_type: the data type of the data element, e.g., f16, f32.
+
+ Similar to the builtin tensor, it also provides an optional attribute to encode
+ the following information via the TensorDescAttr object:
+ * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
+ global memory or shared memory. It is default to Global.
+ * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
+ that will be loaded by block load at a time. It is default to 1.
+ * boundary_check (bool): [optional] indicates whether the operation detects the boundary
+ and pads with zero for out-of-boundary access. It is default to do boundary check.
+
+
+ Syntax:
+
+ ```
+ TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (attr-list)? `>`
+ element-type ::= float-type | integer-type | index-type
+ dim-list := (static-dim-list `x`)?
+ static-dim-list ::= decimal-literal `x` decimal-literal
+ attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)?
+ ```
+
+ Examples:
+
+ ```mlir
+ // A block TensorDesc with 8x16 i32 elements
+ xegpu.tensor_desc<8x16xi32>
+
+ // A block TensorDesc with 8x16 f32 elements
+ xegpu.tensor_desc<8x16xf32>
+
+ // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space.
+ xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr<memory_scope = slm>>
+ ```
+ }];
+
+ let parameters = (ins ArrayRefParameter<"int64_t">: $shape,
+ "mlir::Type": $elementType,
+ OptionalParameter<"mlir::Attribute">: $encoding);
+
+ let extraClassDeclaration = [{
+ using TensorType::clone;
+ using mlir::ShapedType::Trait<TensorDescType>::getElementTypeBitWidth;
+ using mlir::ShapedType::Trait<TensorDescType>::getRank;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumElements;
+ using mlir::ShapedType::Trait<TensorDescType>::isDynamicDim;
+ using mlir::ShapedType::Trait<TensorDescType>::hasStaticShape;
+ using mlir::ShapedType::Trait<TensorDescType>::getNumDynamicDims;
+ using mlir::ShapedType::Trait<TensorDescType>::getDimSize;
+ using mlir::ShapedType::Trait<TensorDescType>::getDynamicDimIndex;
+
+ TensorDescType clone(::mlir::Type elementType) {
+ return llvm::cast<TensorDescType>(cloneWith(getShape(), elementType));
+ }
+
+ TensorDescAttr getEncodingAsTensorDescAttr() const {
+ return llvm::dyn_cast_if_present<TensorDescAttr>(getEncoding());
+ }
+
+ xegpu::MemoryScope getMemoryScope() const {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getMemoryScope())
+ return attr.getMemoryScope().getValue();
+ // return default value
+ return MemoryScope::Global;
+ }
+
+ int getArrayLength() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getArrayLength())
+ return attr.getArrayLength().getInt();
+ // return default value
+ return 1;
+ }
+
+ bool getBoundaryCheck() {
+ auto attr = getEncodingAsTensorDescAttr();
+ if (attr && attr.getBoundaryCheck())
+ return attr.getBoundaryCheck().getValue();
+ // return default value
+ return true;
+ }
+ }];
+
+ let hasCustomAssemblyFormat = true;
+
+}
+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD
diff --git a/mlir/include/mlir/IR/TensorEncoding.td b/mlir/include/mlir/IR/TensorEncoding.td
index 3991520d72a5..4907dcbb5de9 100644
--- a/mlir/include/mlir/IR/TensorEncoding.td
+++ b/mlir/include/mlir/IR/TensorEncoding.td
@@ -34,8 +34,8 @@ def VerifiableTensorEncoding : AttrInterface<"VerifiableTensorEncoding"> {
/*retTy=*/"::mlir::LogicalResult",
/*methodName=*/"verifyEncoding",
/*args=*/(ins
- "ArrayRef<int64_t>":$shape,
- "Type":$elementType,
+ "::mlir::ArrayRef<int64_t>":$shape,
+ "::mlir::Type":$elementType,
"::llvm::function_ref<::mlir::InFlightDiagnostic()>":$emitError)
>,
];
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index 21775e11e071..9bbf12d13254 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -82,6 +82,7 @@
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/PDLExtension/PDLExtension.h"
#include "mlir/Dialect/UB/IR/UBOps.h"
+#include "mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Vector/Transforms/SubsetOpInterfaceImpl.h"
@@ -174,6 +175,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
tosa::registerShardingInterfaceExternalModels(registry);
vector::registerBufferizableOpInterfaceExternalModels(registry);
vector::registerSubsetOpInterfaceExternalModels(registry);
+ vector::registerValueBoundsOpInterfaceExternalModels(registry);
NVVM::registerNVVMTargetInterfaceExternalModels(registry);
ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
spirv::registerSPIRVTargetInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h
index 4a21f76dfc5d..046354677e6a 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.h
@@ -57,6 +57,13 @@ uint64_t
getDefaultPreferredAlignment(Type type, const DataLayout &dataLayout,
ArrayRef<DataLayoutEntryInterface> params);
+/// Default handler for the index bitwidth request. Computes the result for
+/// the built-in index type and dispatches to the DataLayoutTypeInterface for
+/// other types.
+std::optional<uint64_t>
+getDefaultIndexBitwidth(Type type, const DataLayout &dataLayout,
+ ArrayRef<DataLayoutEntryInterface> params);
+
/// Default handler for alloca memory space request. Dispatches to the
/// DataLayoutInterface if specified, otherwise returns the default.
Attribute getDefaultAllocaMemorySpace(DataLayoutEntryInterface entry);
@@ -180,6 +187,11 @@ public:
/// Returns the preferred of the given type in the current scope.
uint64_t getTypePreferredAlignment(Type t) const;
+ /// Returns the bitwidth that should be used when performing index
+ /// computations for the given pointer-like type in the current scope. If the
+ /// type is not a pointer-like type, it returns std::nullopt.
+ std::optional<uint64_t> getTypeIndexBitwidth(Type t) const;
+
/// Returns the memory space used for AllocaOps.
Attribute getAllocaMemorySpace() const;
@@ -216,6 +228,7 @@ private:
mutable DenseMap<Type, llvm::TypeSize> bitsizes;
mutable DenseMap<Type, uint64_t> abiAlignments;
mutable DenseMap<Type, uint64_t> preferredAlignments;
+ mutable DenseMap<Type, std::optional<uint64_t>> indexBitwidths;
/// Cache for alloca, global, and program memory spaces.
mutable std::optional<Attribute> allocaMemorySpace;
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
index a8def967fffc..0ee7a116d114 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
@@ -281,6 +281,22 @@ def DataLayoutOpInterface : OpInterface<"DataLayoutOpInterface"> {
}]
>,
StaticInterfaceMethod<
+ /*description=*/"Returns the bitwidth that should be used when "
+ "performing index computations for the type computed "
+ "using the relevant entries. The data layout object can "
+ "be used for recursive queries.",
+ /*retTy=*/"std::optional<uint64_t>",
+ /*methodName=*/"getIndexBitwidth",
+ /*args=*/(ins "::mlir::Type":$type,
+ "const ::mlir::DataLayout &":$dataLayout,
+ "::mlir::DataLayoutEntryListRef":$params),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{
+ return ::mlir::detail::getDefaultIndexBitwidth(type, dataLayout,
+ params);
+ }]
+ >,
+ StaticInterfaceMethod<
/*description=*/"Returns the memory space used by the ABI computed "
"using the relevant entries. The data layout object "
"can be used for recursive queries.",
@@ -401,6 +417,18 @@ def DataLayoutTypeInterface : TypeInterface<"DataLayoutTypeInterface"> {
"::mlir::DataLayoutEntryListRef":$params)
>,
InterfaceMethod<
+ /*description=*/"Returns the bitwidth that should be used when "
+ "performing index computations for the given "
+ "pointer-like type. If the type is not a pointer-like "
+ "type, returns std::nullopt.",
+ /*retTy=*/"std::optional<uint64_t>",
+ /*methodName=*/"getIndexBitwidth",
+ /*args=*/(ins "const ::mlir::DataLayout &":$dataLayout,
+ "::mlir::DataLayoutEntryListRef":$params),
+ /*methodBody=*/"",
+ /*defaultImplementation=*/[{ return std::nullopt; }]
+ >,
+ InterfaceMethod<
/*desc=*/"Returns true if the two lists of entries are compatible, that "
"is, that `newLayout` spec entries can be nested in an op with "
"`oldLayout` spec entries.",
diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
index 28dadfb9ecf8..b4ed0967e63f 100644
--- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h
@@ -15,6 +15,7 @@
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/ExtensibleRTTI.h"
#include <queue>
@@ -63,7 +64,8 @@ using ValueDimList = SmallVector<std::pair<Value, std::optional<int64_t>>>;
///
/// Note: Any modification of existing IR invalides the data stored in this
/// class. Adding new operations is allowed.
-class ValueBoundsConstraintSet {
+class ValueBoundsConstraintSet
+ : public llvm::RTTIExtends<ValueBoundsConstraintSet, llvm::RTTIRoot> {
protected:
/// Helper class that builds a bound for a shaped value dimension or
/// index-typed value.
@@ -107,6 +109,8 @@ protected:
};
public:
+ static char ID;
+
/// The stop condition when traversing the backward slice of a shaped value/
/// index-type value. The traversal continues until the stop condition
/// evaluates to "true" for a value.
@@ -265,6 +269,16 @@ protected:
ValueBoundsConstraintSet(MLIRContext *ctx);
+ /// Populates the constraint set for a value/map without actually computing
+ /// the bound. Returns the position for the value/map (via the return value
+ /// and `posOut` output parameter).
+ int64_t populateConstraintsSet(Value value,
+ std::optional<int64_t> dim = std::nullopt,
+ StopConditionFn stopCondition = nullptr);
+ int64_t populateConstraintsSet(AffineMap map, ValueDimList mapOperands,
+ StopConditionFn stopCondition = nullptr,
+ int64_t *posOut = nullptr);
+
/// Iteratively process all elements on the worklist until an index-typed
/// value or shaped value meets `stopCondition`. Such values are not processed
/// any further.
diff --git a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp
index 6a2cfb235fcf..eb6951dc5584 100644
--- a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp
+++ b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp
@@ -15,7 +15,7 @@
#include "mlir/CAPI/IR.h"
#include "mlir/CAPI/Support.h"
#include "mlir/CAPI/Wrap.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
using namespace mlir;
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 8219cf98575f..41ab7046b91c 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -35,6 +35,7 @@ add_subdirectory(MathToFuncs)
add_subdirectory(MathToLibm)
add_subdirectory(MathToLLVM)
add_subdirectory(MathToSPIRV)
+add_subdirectory(MemRefToEmitC)
add_subdirectory(MemRefToLLVM)
add_subdirectory(MemRefToSPIRV)
add_subdirectory(NVGPUToNVVM)
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index d6a5d8cd74d5..b95fba20a00c 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -155,8 +155,6 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
auto valueTy = adaptor.getValue().getType();
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
auto predTy = IntegerType::get(rewriter.getContext(), 1);
- auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
- {valueTy, predTy});
Value one = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 1);
Value minusOne = rewriter.create<LLVM::ConstantOp>(loc, int32Type, -1);
@@ -176,14 +174,25 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.getWidth(), one);
}
- auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
+ bool predIsUsed = !op->getResult(1).use_empty();
+ UnitAttr returnValueAndIsValidAttr = nullptr;
+ Type resultTy = valueTy;
+ if (predIsUsed) {
+ returnValueAndIsValidAttr = rewriter.getUnitAttr();
+ resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
+ {valueTy, predTy});
+ }
Value shfl = rewriter.create<NVVM::ShflOp>(
loc, resultTy, activeMask, adaptor.getValue(), adaptor.getOffset(),
maskAndClamp, convertShflKind(op.getMode()), returnValueAndIsValidAttr);
- Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
- Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
-
- rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
+ if (predIsUsed) {
+ Value shflValue = rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 0);
+ Value isActiveSrcLane =
+ rewriter.create<LLVM::ExtractValueOp>(loc, shfl, 1);
+ rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
+ } else {
+ rewriter.replaceOp(op, {shfl, nullptr});
+ }
return success();
}
};
diff --git a/mlir/lib/Conversion/MemRefToEmitC/CMakeLists.txt b/mlir/lib/Conversion/MemRefToEmitC/CMakeLists.txt
new file mode 100644
index 000000000000..8a72e747d024
--- /dev/null
+++ b/mlir/lib/Conversion/MemRefToEmitC/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_mlir_conversion_library(MLIRMemRefToEmitC
+ MemRefToEmitC.cpp
+ MemRefToEmitCPass.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/MemRefToEmitC
+
+ DEPENDS
+ MLIRConversionPassIncGen
+
+ LINK_COMPONENTS
+ Core
+
+ LINK_LIBS PUBLIC
+ MLIREmitCDialect
+ MLIRMemRefDialect
+ MLIRTransforms
+ )
diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
new file mode 100644
index 000000000000..0e3b64692126
--- /dev/null
+++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp
@@ -0,0 +1,114 @@
+//===- MemRefToEmitC.cpp - MemRef to EmitC conversion ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements patterns to convert memref ops into emitc ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h"
+
+#include "mlir/Dialect/EmitC/IR/EmitC.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+using namespace mlir;
+
+namespace {
+struct ConvertAlloca final : public OpConversionPattern<memref::AllocaOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(memref::AllocaOp op, OpAdaptor operands,
+ ConversionPatternRewriter &rewriter) const override {
+
+ if (!op.getType().hasStaticShape()) {
+ return rewriter.notifyMatchFailure(
+ op.getLoc(), "cannot transform alloca with dynamic shape");
+ }
+
+ if (op.getAlignment().value_or(1) > 1) {
+ // TODO: Allow alignment if it is not more than the natural alignment
+ // of the C array.
+ return rewriter.notifyMatchFailure(
+ op.getLoc(), "cannot transform alloca with alignment requirement");
+ }
+
+ auto resultTy = getTypeConverter()->convertType(op.getType());
+ if (!resultTy) {
+ return rewriter.notifyMatchFailure(op.getLoc(), "cannot convert type");
+ }
+ auto noInit = emitc::OpaqueAttr::get(getContext(), "");
+ rewriter.replaceOpWithNewOp<emitc::VariableOp>(op, resultTy, noInit);
+ return success();
+ }
+};
+
+struct ConvertLoad final : public OpConversionPattern<memref::LoadOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(memref::LoadOp op, OpAdaptor operands,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto resultTy = getTypeConverter()->convertType(op.getType());
+ if (!resultTy) {
+ return rewriter.notifyMatchFailure(op.getLoc(), "cannot convert type");
+ }
+
+ auto subscript = rewriter.create<emitc::SubscriptOp>(
+ op.getLoc(), operands.getMemref(), operands.getIndices());
+
+ auto noInit = emitc::OpaqueAttr::get(getContext(), "");
+ auto var =
+ rewriter.create<emitc::VariableOp>(op.getLoc(), resultTy, noInit);
+
+ rewriter.create<emitc::AssignOp>(op.getLoc(), var, subscript);
+ rewriter.replaceOp(op, var);
+ return success();
+ }
+};
+
+struct ConvertStore final : public OpConversionPattern<memref::StoreOp> {
+ using OpConversionPattern::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(memref::StoreOp op, OpAdaptor operands,
+ ConversionPatternRewriter &rewriter) const override {
+
+ auto subscript = rewriter.create<emitc::SubscriptOp>(
+ op.getLoc(), operands.getMemref(), operands.getIndices());
+ rewriter.replaceOpWithNewOp<emitc::AssignOp>(op, subscript,
+ operands.getValue());
+ return success();
+ }
+};
+} // namespace
+
+void mlir::populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter) {
+ typeConverter.addConversion(
+ [&](MemRefType memRefType) -> std::optional<Type> {
+ if (!memRefType.hasStaticShape() ||
+ !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0) {
+ return {};
+ }
+ Type convertedElementType =
+ typeConverter.convertType(memRefType.getElementType());
+ if (!convertedElementType)
+ return {};
+ return emitc::ArrayType::get(memRefType.getShape(),
+ convertedElementType);
+ });
+}
+
+void mlir::populateMemRefToEmitCConversionPatterns(RewritePatternSet &patterns,
+ TypeConverter &converter) {
+ patterns.add<ConvertAlloca, ConvertLoad, ConvertStore>(converter,
+ patterns.getContext());
+}
diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp
new file mode 100644
index 000000000000..4e5d1912d157
--- /dev/null
+++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp
@@ -0,0 +1,55 @@
+//===- MemRefToEmitCPass.cpp - MemRef to EmitC conversion pass ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to convert memref ops into emitc ops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/MemRefToEmitC/MemRefToEmitCPass.h"
+
+#include "mlir/Conversion/MemRefToEmitC/MemRefToEmitC.h"
+#include "mlir/Dialect/EmitC/IR/EmitC.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTMEMREFTOEMITC
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct ConvertMemRefToEmitCPass
+ : public impl::ConvertMemRefToEmitCBase<ConvertMemRefToEmitCPass> {
+ void runOnOperation() override {
+ TypeConverter converter;
+
+ // Fallback for other types.
+ converter.addConversion([](Type type) -> std::optional<Type> {
+ if (isa<MemRefType>(type))
+ return {};
+ return type;
+ });
+
+ populateMemRefToEmitCTypeConversion(converter);
+
+ RewritePatternSet patterns(&getContext());
+ populateMemRefToEmitCConversionPatterns(patterns, converter);
+
+ ConversionTarget target(getContext());
+ target.addIllegalDialect<memref::MemRefDialect>();
+ target.addLegalDialect<emitc::EmitCDialect>();
+
+ if (failed(applyPartialConversion(getOperation(), target,
+ std::move(patterns))))
+ return signalPassFailure();
+ }
+};
+} // namespace
diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
index 0acb2142f3f6..81b9f55cac80 100644
--- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
+++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRV.cpp
@@ -50,11 +50,12 @@ static Value getOffsetForBitwidth(Location loc, Value srcIdx, int sourceBits,
assert(targetBits % sourceBits == 0);
Type type = srcIdx.getType();
IntegerAttr idxAttr = builder.getIntegerAttr(type, targetBits / sourceBits);
- auto idx = builder.create<spirv::ConstantOp>(loc, type, idxAttr);
+ auto idx = builder.createOrFold<spirv::ConstantOp>(loc, type, idxAttr);
IntegerAttr srcBitsAttr = builder.getIntegerAttr(type, sourceBits);
- auto srcBitsValue = builder.create<spirv::ConstantOp>(loc, type, srcBitsAttr);
- auto m = builder.create<spirv::UModOp>(loc, srcIdx, idx);
- return builder.create<spirv::IMulOp>(loc, type, m, srcBitsValue);
+ auto srcBitsValue =
+ builder.createOrFold<spirv::ConstantOp>(loc, type, srcBitsAttr);
+ auto m = builder.createOrFold<spirv::UModOp>(loc, srcIdx, idx);
+ return builder.createOrFold<spirv::IMulOp>(loc, type, m, srcBitsValue);
}
/// Returns an adjusted spirv::AccessChainOp. Based on the
@@ -74,11 +75,11 @@ adjustAccessChainForBitwidth(const SPIRVTypeConverter &typeConverter,
Value lastDim = op->getOperand(op.getNumOperands() - 1);
Type type = lastDim.getType();
IntegerAttr attr = builder.getIntegerAttr(type, targetBits / sourceBits);
- auto idx = builder.create<spirv::ConstantOp>(loc, type, attr);
+ auto idx = builder.createOrFold<spirv::ConstantOp>(loc, type, attr);
auto indices = llvm::to_vector<4>(op.getIndices());
// There are two elements if this is a 1-D tensor.
assert(indices.size() == 2);
- indices.back() = builder.create<spirv::SDivOp>(loc, lastDim, idx);
+ indices.back() = builder.createOrFold<spirv::SDivOp>(loc, lastDim, idx);
Type t = typeConverter.convertType(op.getComponentPtr().getType());
return builder.create<spirv::AccessChainOp>(loc, t, op.getBasePtr(), indices);
}
@@ -91,7 +92,8 @@ static Value castBoolToIntN(Location loc, Value srcBool, Type dstType,
return srcBool;
Value zero = spirv::ConstantOp::getZero(dstType, loc, builder);
Value one = spirv::ConstantOp::getOne(dstType, loc, builder);
- return builder.create<spirv::SelectOp>(loc, dstType, srcBool, one, zero);
+ return builder.createOrFold<spirv::SelectOp>(loc, dstType, srcBool, one,
+ zero);
}
/// Returns the `targetBits`-bit value shifted by the given `offset`, and cast
@@ -111,10 +113,10 @@ static Value shiftValue(Location loc, Value value, Value offset, Value mask,
loc, builder.getIntegerType(targetBits), value);
}
- value = builder.create<spirv::BitwiseAndOp>(loc, value, mask);
+ value = builder.createOrFold<spirv::BitwiseAndOp>(loc, value, mask);
}
- return builder.create<spirv::ShiftLeftLogicalOp>(loc, value.getType(), value,
- offset);
+ return builder.createOrFold<spirv::ShiftLeftLogicalOp>(loc, value.getType(),
+ value, offset);
}
/// Returns true if the allocations of memref `type` generated from `allocOp`
@@ -165,7 +167,7 @@ static Value castIntNToBool(Location loc, Value srcInt, OpBuilder &builder) {
return srcInt;
auto one = spirv::ConstantOp::getOne(srcInt.getType(), loc, builder);
- return builder.create<spirv::IEqualOp>(loc, srcInt, one);
+ return builder.createOrFold<spirv::IEqualOp>(loc, srcInt, one);
}
//===----------------------------------------------------------------------===//
@@ -597,13 +599,14 @@ IntLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor,
// ____XXXX________ -> ____________XXXX
Value lastDim = accessChainOp->getOperand(accessChainOp.getNumOperands() - 1);
Value offset = getOffsetForBitwidth(loc, lastDim, srcBits, dstBits, rewriter);
- Value result = rewriter.create<spirv::ShiftRightArithmeticOp>(
+ Value result = rewriter.createOrFold<spirv::ShiftRightArithmeticOp>(
loc, spvLoadOp.getType(), spvLoadOp, offset);
// Apply the mask to extract corresponding bits.
- Value mask = rewriter.create<spirv::ConstantOp>(
+ Value mask = rewriter.createOrFold<spirv::ConstantOp>(
loc, dstType, rewriter.getIntegerAttr(dstType, (1 << srcBits) - 1));
- result = rewriter.create<spirv::BitwiseAndOp>(loc, dstType, result, mask);
+ result =
+ rewriter.createOrFold<spirv::BitwiseAndOp>(loc, dstType, result, mask);
// Apply sign extension on the loading value unconditionally. The signedness
// semantic is carried in the operator itself, we relies other pattern to
@@ -611,11 +614,11 @@ IntLoadOpPattern::matchAndRewrite(memref::LoadOp loadOp, OpAdaptor adaptor,
IntegerAttr shiftValueAttr =
rewriter.getIntegerAttr(dstType, dstBits - srcBits);
Value shiftValue =
- rewriter.create<spirv::ConstantOp>(loc, dstType, shiftValueAttr);
- result = rewriter.create<spirv::ShiftLeftLogicalOp>(loc, dstType, result,
- shiftValue);
- result = rewriter.create<spirv::ShiftRightArithmeticOp>(loc, dstType, result,
- shiftValue);
+ rewriter.createOrFold<spirv::ConstantOp>(loc, dstType, shiftValueAttr);
+ result = rewriter.createOrFold<spirv::ShiftLeftLogicalOp>(loc, dstType,
+ result, shiftValue);
+ result = rewriter.createOrFold<spirv::ShiftRightArithmeticOp>(
+ loc, dstType, result, shiftValue);
rewriter.replaceOp(loadOp, result);
@@ -744,11 +747,12 @@ IntStoreOpPattern::matchAndRewrite(memref::StoreOp storeOp, OpAdaptor adaptor,
// Create a mask to clear the destination. E.g., if it is the second i8 in
// i32, 0xFFFF00FF is created.
- Value mask = rewriter.create<spirv::ConstantOp>(
+ Value mask = rewriter.createOrFold<spirv::ConstantOp>(
loc, dstType, rewriter.getIntegerAttr(dstType, (1 << srcBits) - 1));
- Value clearBitsMask =
- rewriter.create<spirv::ShiftLeftLogicalOp>(loc, dstType, mask, offset);
- clearBitsMask = rewriter.create<spirv::NotOp>(loc, dstType, clearBitsMask);
+ Value clearBitsMask = rewriter.createOrFold<spirv::ShiftLeftLogicalOp>(
+ loc, dstType, mask, offset);
+ clearBitsMask =
+ rewriter.createOrFold<spirv::NotOp>(loc, dstType, clearBitsMask);
Value storeVal = shiftValue(loc, adaptor.getValue(), offset, mask, rewriter);
Value adjustedPtr = adjustAccessChainForBitwidth(typeConverter, accessChainOp,
@@ -910,7 +914,7 @@ LogicalResult ReinterpretCastPattern::matchAndRewrite(
int64_t attrVal = cast<IntegerAttr>(offset.get<Attribute>()).getInt();
Attribute attr = rewriter.getIntegerAttr(intType, attrVal);
- return rewriter.create<spirv::ConstantOp>(loc, intType, attr);
+ return rewriter.createOrFold<spirv::ConstantOp>(loc, intType, attr);
}();
rewriter.replaceOpWithNewOp<spirv::InBoundsPtrAccessChainOp>(
diff --git a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
index 0b4570c65c51..e8bfd4421f5c 100644
--- a/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
+++ b/mlir/lib/Dialect/Affine/TransformOps/AffineTransformOps.cpp
@@ -13,7 +13,7 @@
#include "mlir/Dialect/Affine/IR/AffineValueMap.h"
#include "mlir/Dialect/Affine/LoopUtils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
using namespace mlir;
diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
index 3dc5539cde3d..8b8ed2578ca5 100644
--- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp
@@ -1792,8 +1792,7 @@ MemRefType mlir::affine::normalizeMemRefType(MemRefType memrefType) {
MLIRContext *context = memrefType.getContext();
for (unsigned d = 0; d < newRank; ++d) {
// Check if this dimension is dynamic.
- if (bool isDynDim =
- isNormalizedMemRefDynamicDim(d, layoutMap, memrefTypeDynDims)) {
+ if (isNormalizedMemRefDynamicDim(d, layoutMap, memrefTypeDynDims)) {
newShape[d] = ShapedType::kDynamic;
continue;
}
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 8dbf70162012..32f4e6a0fe89 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -182,6 +182,11 @@ parseHeuristicOption(const std::string &s) {
return OneShotBufferizationOptions::AnalysisHeuristic::BottomUp;
if (s == "top-down")
return OneShotBufferizationOptions::AnalysisHeuristic::TopDown;
+ if (s == "bottom-up-from-terminators")
+ return OneShotBufferizationOptions::AnalysisHeuristic::
+ BottomUpFromTerminators;
+ if (s == "fuzzer")
+ return OneShotBufferizationOptions::AnalysisHeuristic::Fuzzer;
llvm_unreachable("invalid analysisheuristic option");
}
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index fba9cd873063..531016130d1d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -51,6 +51,7 @@
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/AsmState.h"
#include "mlir/IR/Dominance.h"
+#include "mlir/IR/Iterators.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
@@ -1094,41 +1095,104 @@ static void equivalenceAnalysis(Operation *op, OneShotAnalysisState &state) {
equivalenceAnalysis(ops, state);
}
-LogicalResult OneShotAnalysisState::analyzeOp(Operation *op,
- const DominanceInfo &domInfo) {
- // Collect ops so we can build our own reverse traversal.
- SmallVector<Operation *> ops;
- op->walk([&](Operation *op) {
- // No tensors => no buffers.
- if (!hasTensorSemantics(op))
+/// "Bottom-up from terminators" heuristic.
+static SmallVector<Operation *>
+bottomUpFromTerminatorsHeuristic(Operation *op,
+ const OneShotAnalysisState &state) {
+ SetVector<Operation *> traversedOps;
+
+ // Find region terminators.
+ op->walk<WalkOrder::PostOrder>([&](RegionBranchTerminatorOpInterface term) {
+ if (!traversedOps.insert(term))
return;
- ops.push_back(op);
+ // Follow the reverse SSA use-def chain from each yielded value as long as
+ // we stay within the same region.
+ SmallVector<OpResult> worklist;
+ for (Value v : term->getOperands()) {
+ if (!isa<TensorType>(v.getType()))
+ continue;
+ auto opResult = dyn_cast<OpResult>(v);
+ if (!opResult)
+ continue;
+ worklist.push_back(opResult);
+ }
+ while (!worklist.empty()) {
+ OpResult opResult = worklist.pop_back_val();
+ Operation *defOp = opResult.getDefiningOp();
+ if (!traversedOps.insert(defOp))
+ continue;
+ if (!term->getParentRegion()->findAncestorOpInRegion(*defOp))
+ continue;
+ AliasingOpOperandList aliases = state.getAliasingOpOperands(opResult);
+ for (auto alias : aliases) {
+ Value v = alias.opOperand->get();
+ if (!isa<TensorType>(v.getType()))
+ continue;
+ auto opResult = dyn_cast<OpResult>(v);
+ if (!opResult)
+ continue;
+ worklist.push_back(opResult);
+ }
+ }
});
- if (getOptions().analysisFuzzerSeed) {
- // This is a fuzzer. For testing purposes only. Randomize the order in which
- // operations are analyzed. The bufferization quality is likely worse, but
- // we want to make sure that no assertions are triggered anywhere.
- std::mt19937 g(getOptions().analysisFuzzerSeed);
- llvm::shuffle(ops.begin(), ops.end(), g);
- }
+ // Analyze traversed ops, then all remaining ops.
+ SmallVector<Operation *> result(traversedOps.begin(), traversedOps.end());
+ op->walk<WalkOrder::PostOrder, ReverseIterator>([&](Operation *op) {
+ if (!traversedOps.contains(op) && hasTensorSemantics(op))
+ result.push_back(op);
+ });
+ return result;
+}
+LogicalResult OneShotAnalysisState::analyzeOp(Operation *op,
+ const DominanceInfo &domInfo) {
OneShotBufferizationOptions::AnalysisHeuristic heuristic =
getOptions().analysisHeuristic;
- if (heuristic == OneShotBufferizationOptions::AnalysisHeuristic::BottomUp) {
- // Default: Walk ops in reverse for better interference analysis.
- for (Operation *op : reverse(ops))
- if (failed(analyzeSingleOp(op, domInfo)))
- return failure();
- } else if (heuristic ==
- OneShotBufferizationOptions::AnalysisHeuristic::TopDown) {
- for (Operation *op : ops)
- if (failed(analyzeSingleOp(op, domInfo)))
- return failure();
+
+ SmallVector<Operation *> orderedOps;
+ if (heuristic ==
+ OneShotBufferizationOptions::AnalysisHeuristic::BottomUpFromTerminators) {
+ orderedOps = bottomUpFromTerminatorsHeuristic(op, *this);
} else {
- llvm_unreachable("unsupported heuristic");
+ op->walk([&](Operation *op) {
+ // No tensors => no buffers.
+ if (!hasTensorSemantics(op))
+ return;
+ orderedOps.push_back(op);
+ });
+ switch (heuristic) {
+ case OneShotBufferizationOptions::AnalysisHeuristic::BottomUp: {
+ // Default: Walk ops in reverse for better interference analysis.
+ std::reverse(orderedOps.begin(), orderedOps.end());
+ break;
+ }
+ case OneShotBufferizationOptions::AnalysisHeuristic::TopDown: {
+ // Ops are already sorted top-down in `orderedOps`.
+ break;
+ }
+ case OneShotBufferizationOptions::AnalysisHeuristic::Fuzzer: {
+      assert(getOptions().analysisFuzzerSeed &&
+             "expected that fuzzer seed is set");
+ // This is a fuzzer. For testing purposes only. Randomize the order in
+ // which operations are analyzed. The bufferization quality is likely
+ // worse, but we want to make sure that no assertions are triggered
+ // anywhere.
+ std::mt19937 g(getOptions().analysisFuzzerSeed);
+ llvm::shuffle(orderedOps.begin(), orderedOps.end(), g);
+ break;
+ }
+ default: {
+ llvm_unreachable("unsupported heuristic");
+ }
+ }
}
+ // Analyze ops in the computed order.
+ for (Operation *op : orderedOps)
+ if (failed(analyzeSingleOp(op, domInfo)))
+ return failure();
+
equivalenceAnalysis(op, *this);
return success();
}
diff --git a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
index 5b03f81b305f..e7c431f39e3f 100644
--- a/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
+++ b/mlir/lib/Dialect/EmitC/Transforms/FormExpressions.cpp
@@ -36,7 +36,8 @@ struct FormExpressionsPass
// Wrap each C operator op with an expression op.
OpBuilder builder(context);
auto matchFun = [&](Operation *op) {
- if (op->hasTrait<OpTrait::emitc::CExpression>())
+ if (op->hasTrait<OpTrait::emitc::CExpression>() &&
+ !op->getParentOfType<emitc::ExpressionOp>())
createExpression(op, builder);
};
rootOp->walk(matchFun);
diff --git a/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp b/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp
index 9e79b086c0be..1e262736226f 100644
--- a/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp
+++ b/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp
@@ -13,8 +13,8 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Transforms/DialectConversion.h"
using namespace mlir;
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index ada985b5979e..fc3a43756945 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -22,7 +22,7 @@
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
diff --git a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
index cc3be7da11e0..e8ecbe16c3f0 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/Utils.cpp
@@ -17,7 +17,7 @@
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/AffineExpr.h"
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
index 443e245887ea..630187f220a4 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypes.cpp
@@ -287,15 +287,22 @@ getPointerDataLayoutEntry(DataLayoutEntryListRef params, LLVMPointerType type,
}
}
if (currentEntry) {
- return *extractPointerSpecValue(currentEntry, pos) /
- (pos == PtrDLEntryPos::Size ? 1 : kBitsInByte);
+ std::optional<uint64_t> value = extractPointerSpecValue(currentEntry, pos);
+ // If the optional `PtrDLEntryPos::Index` entry is not available, use the
+ // pointer size as the index bitwidth.
+ if (!value && pos == PtrDLEntryPos::Index)
+ value = extractPointerSpecValue(currentEntry, PtrDLEntryPos::Size);
+ bool isSizeOrIndex =
+ pos == PtrDLEntryPos::Size || pos == PtrDLEntryPos::Index;
+ return *value / (isSizeOrIndex ? 1 : kBitsInByte);
}
// If not found, and this is the pointer to the default memory space, assume
// 64-bit pointers.
if (type.getAddressSpace() == 0) {
- return pos == PtrDLEntryPos::Size ? kDefaultPointerSizeBits
- : kDefaultPointerAlignment;
+ bool isSizeOrIndex =
+ pos == PtrDLEntryPos::Size || pos == PtrDLEntryPos::Index;
+ return isSizeOrIndex ? kDefaultPointerSizeBits : kDefaultPointerAlignment;
}
return std::nullopt;
@@ -332,6 +339,16 @@ LLVMPointerType::getPreferredAlignment(const DataLayout &dataLayout,
return dataLayout.getTypePreferredAlignment(get(getContext()));
}
+std::optional<uint64_t>
+LLVMPointerType::getIndexBitwidth(const DataLayout &dataLayout,
+ DataLayoutEntryListRef params) const {
+ if (std::optional<uint64_t> indexBitwidth =
+ getPointerDataLayoutEntry(params, *this, PtrDLEntryPos::Index))
+ return *indexBitwidth;
+
+ return dataLayout.getTypeIndexBitwidth(get(getContext()));
+}
+
bool LLVMPointerType::areCompatible(DataLayoutEntryListRef oldLayout,
DataLayoutEntryListRef newLayout) const {
for (DataLayoutEntryInterface newEntry : newLayout) {
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp
index fb18886c16b1..ae2a34bcf3e5 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgMatchOps.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/Linalg/TransformOps/Syntax.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
+#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Interfaces/FunctionImplementation.h"
#include "llvm/Support/Debug.h"
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index ae28049f02e3..ecf998312482 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -25,9 +25,9 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Utils/Utils.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
@@ -3269,15 +3269,20 @@ DiagnosedSilenceableFailure transform::FlattenElementwiseLinalgOp::applyToOne(
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
rewriter.setInsertionPoint(target);
- if (target.getNumLoops() <= 1)
+ if (!isElementwise(target)) {
+ failed(rewriter.notifyMatchFailure(
+ target, "only elementwise flattening is supported"));
+ return emitDefaultSilenceableFailure(target);
+ }
+ // If rank <= 1, do nothing
+ if (target.getNumLoops() <= 1) {
+ results.push_back(target);
return DiagnosedSilenceableFailure::success();
+ }
ReassociationIndices reassociation(target.getNumLoops());
std::iota(reassociation.begin(), reassociation.end(), 0);
auto maybeFlattened =
- (isElementwise(target))
- ? collapseOpIterationDims(target, reassociation, rewriter)
- : FailureOr<CollapseResult>(rewriter.notifyMatchFailure(
- target, "only elementwise flattening is supported"));
+ collapseOpIterationDims(target, reassociation, rewriter);
if (failed(maybeFlattened))
return emitDefaultSilenceableFailure(target);
results.push_back(maybeFlattened->collapsedOp);
diff --git a/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp b/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp
index 8932d6164182..b3481ce1c56b 100644
--- a/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp
+++ b/mlir/lib/Dialect/MemRef/TransformOps/MemRefTransformOps.cpp
@@ -19,7 +19,8 @@
#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
diff --git a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
index bc2fe5772af9..4d8d93f7aac7 100644
--- a/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
+++ b/mlir/lib/Dialect/SCF/TransformOps/SCFTransformOps.cpp
@@ -17,8 +17,8 @@
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/Utils/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinAttributes.h"
diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
index 2b79c8022b8e..4072608dc8f8 100644
--- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
+++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp
@@ -991,15 +991,16 @@ Value mlir::spirv::linearizeIndex(ValueRange indices, ArrayRef<int64_t> strides,
// broken down into progressive small steps so we can have intermediate steps
// using other dialects. At the moment SPIR-V is the final sink.
- Value linearizedIndex = builder.create<spirv::ConstantOp>(
+ Value linearizedIndex = builder.createOrFold<spirv::ConstantOp>(
loc, integerType, IntegerAttr::get(integerType, offset));
for (const auto &index : llvm::enumerate(indices)) {
- Value strideVal = builder.create<spirv::ConstantOp>(
+ Value strideVal = builder.createOrFold<spirv::ConstantOp>(
loc, integerType,
IntegerAttr::get(integerType, strides[index.index()]));
- Value update = builder.create<spirv::IMulOp>(loc, strideVal, index.value());
+ Value update =
+ builder.createOrFold<spirv::IMulOp>(loc, index.value(), strideVal);
linearizedIndex =
- builder.create<spirv::IAddOp>(loc, linearizedIndex, update);
+ builder.createOrFold<spirv::IAddOp>(loc, update, linearizedIndex);
}
return linearizedIndex;
}
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
index 5ca9510408c3..4b3156728cc9 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorDialect.cpp
@@ -11,7 +11,7 @@
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Interfaces/SubsetOpInterface.h"
#include "mlir/Transforms/InliningUtils.h"
diff --git a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp
index 38f1824a3634..5c6a32ce9a68 100644
--- a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp
+++ b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp
@@ -14,7 +14,7 @@
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
#include "mlir/Dialect/Tensor/Utils/Utils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/Builders.h"
#include "mlir/Transforms/DialectConversion.h"
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
index 050f8ca3f32a..6575b39fd45a 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaFolders.cpp
@@ -132,14 +132,17 @@ bool constantUnaryOpShouldBeFolded(TosaOp unaryOp, DenseElementsAttr values) {
return inputOp.hasOneUse();
}
-template <typename BaseType>
-DenseElementsAttr transposeType(ElementsAttr attr, ShapedType inputType,
+template <typename RangeType>
+DenseElementsAttr transposeType(const RangeType &data, ShapedType inputType,
ShapedType outputType,
llvm::ArrayRef<int64_t> permValues) {
+ using ElementType = std::decay_t<decltype(*std::begin(data))>;
+
+ assert(inputType.getElementType() == outputType.getElementType());
+
if (inputType.getNumElements() == 0)
- return DenseElementsAttr::get(outputType, llvm::ArrayRef<BaseType>{});
+ return DenseElementsAttr::get(outputType, llvm::ArrayRef<ElementType>{});
- auto attrValues = attr.getValues<BaseType>();
auto inputShape = inputType.getShape();
// The inverted permutation map and strides of the output are used to compute
@@ -148,10 +151,11 @@ DenseElementsAttr transposeType(ElementsAttr attr, ShapedType inputType,
auto outputStrides = computeStrides(outputType.getShape());
auto invertedPermValues = invertPermutationVector(permValues);
- auto initialValue = *std::begin(attrValues);
- SmallVector<BaseType> outputValues(inputType.getNumElements(), initialValue);
+ auto initialValue = *std::begin(data);
+ SmallVector<ElementType> outputValues(inputType.getNumElements(),
+ initialValue);
- for (const auto &it : llvm::enumerate(attrValues)) {
+ for (const auto &it : llvm::enumerate(data)) {
auto srcLinearIndex = it.index();
uint64_t dstLinearIndex = 0;
@@ -170,7 +174,7 @@ DenseElementsAttr transposeType(ElementsAttr attr, ShapedType inputType,
}
return DenseElementsAttr::get(outputType,
- llvm::ArrayRef<BaseType>(outputValues));
+ llvm::ArrayRef<ElementType>(outputValues));
}
// A type specialized transposition of an ElementsAttr.
@@ -180,32 +184,28 @@ DenseElementsAttr transposeType(ElementsAttr attr, ShapedType inputType,
DenseElementsAttr transpose(ElementsAttr attr, ShapedType inputType,
ShapedType outputType,
llvm::ArrayRef<int64_t> permValues) {
- auto baseType = inputType.getElementType();
-
- // Handle possible integer types
- if (auto intType = dyn_cast<IntegerType>(baseType)) {
- switch (intType.getWidth()) {
- case 1:
- return transposeType<bool>(attr, inputType, outputType, permValues);
- case 8:
- return transposeType<int8_t>(attr, inputType, outputType, permValues);
- case 16:
- return transposeType<int16_t>(attr, inputType, outputType, permValues);
- case 32:
- return transposeType<int32_t>(attr, inputType, outputType, permValues);
- case 64:
- return transposeType<int64_t>(attr, inputType, outputType, permValues);
- default:
- return transposeType<APInt>(attr, inputType, outputType, permValues);
- }
- }
+ if (auto data = attr.tryGetValues<bool>())
+ return transposeType(*data, inputType, outputType, permValues);
- // Handle possible float types
- if (baseType.isF32()) {
- return transposeType<float>(attr, inputType, outputType, permValues);
- }
+ if (auto data = attr.tryGetValues<int8_t>())
+ return transposeType(*data, inputType, outputType, permValues);
+
+ if (auto data = attr.tryGetValues<int16_t>())
+ return transposeType(*data, inputType, outputType, permValues);
+
+ if (auto data = attr.tryGetValues<int32_t>())
+ return transposeType(*data, inputType, outputType, permValues);
- return transposeType<APFloat>(attr, inputType, outputType, permValues);
+ if (auto data = attr.tryGetValues<int64_t>())
+ return transposeType(*data, inputType, outputType, permValues);
+
+ if (auto data = attr.tryGetValues<float>())
+ return transposeType(*data, inputType, outputType, permValues);
+
+ if (auto data = attr.tryGetValues<APFloat>())
+ return transposeType(*data, inputType, outputType, permValues);
+
+ return nullptr;
}
struct TosaFoldConstantTranspose : public OpRewritePattern<tosa::TransposeOp> {
@@ -228,14 +228,19 @@ struct TosaFoldConstantTranspose : public OpRewritePattern<tosa::TransposeOp> {
DenseIntElementsAttr permAttr;
if (!matchPattern(op.getPerms(), m_Constant(&permAttr)))
return failure();
- auto permValues = llvm::to_vector<6>(llvm::map_range(
+ auto permValues = llvm::map_to_vector(
// TOSA allows both 32- and 64-bit integer tensors here.
permAttr.getValues<APInt>(),
- [](const APInt &val) { return val.getSExtValue(); }));
+ [](const APInt &val) { return val.getSExtValue(); });
auto inputType = cast<ShapedType>(op.getInput1().getType());
auto resultAttr = transpose(inputValues, inputType, outputType, permValues);
+ if (!resultAttr) {
+ return rewriter.notifyMatchFailure(
+ op, "unsupported attribute or element type");
+ }
+
rewriter.replaceOpWithNewOp<tosa::ConstOp>(op, outputType, resultAttr);
return success();
}
diff --git a/mlir/lib/Dialect/Transform/CMakeLists.txt b/mlir/lib/Dialect/Transform/CMakeLists.txt
index ed05194b7299..64115dcc29d6 100644
--- a/mlir/lib/Dialect/Transform/CMakeLists.txt
+++ b/mlir/lib/Dialect/Transform/CMakeLists.txt
@@ -1,4 +1,5 @@
add_subdirectory(DebugExtension)
+add_subdirectory(Interfaces)
add_subdirectory(IR)
add_subdirectory(LoopExtension)
add_subdirectory(PDLExtension)
diff --git a/mlir/lib/Dialect/Transform/IR/CMakeLists.txt b/mlir/lib/Dialect/Transform/IR/CMakeLists.txt
index 34083b2fd7aa..f90ac089adaa 100644
--- a/mlir/lib/Dialect/Transform/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Transform/IR/CMakeLists.txt
@@ -2,7 +2,6 @@ add_mlir_dialect_library(MLIRTransformDialect
MatchInterfaces.cpp
TransformAttrs.cpp
TransformDialect.cpp
- TransformInterfaces.cpp
TransformOps.cpp
TransformTypes.cpp
Utils.cpp
@@ -10,7 +9,6 @@ add_mlir_dialect_library(MLIRTransformDialect
DEPENDS
MLIRMatchInterfacesIncGen
MLIRTransformDialectIncGen
- MLIRTransformInterfacesIncGen
LINK_LIBS PUBLIC
MLIRCastInterfaces
@@ -24,5 +22,6 @@ add_mlir_dialect_library(MLIRTransformDialect
MLIRRewrite
MLIRSideEffectInterfaces
MLIRTransforms
+ MLIRTransformDialectInterfaces
MLIRTransformDialectUtils
)
diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
index fb355bc97192..e628430ff861 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp
@@ -8,10 +8,10 @@
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Analysis/CallGraph.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/Dialect/Transform/IR/Utils.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/DialectImplementation.h"
#include "llvm/ADT/SCCIterator.h"
@@ -178,7 +178,8 @@ LogicalResult transform::TransformDialect::verifyOperationAttribute(
}
return success();
}
- if (attribute.getName().getValue() == kSilenceTrackingFailuresAttrName) {
+ if (attribute.getName().getValue() ==
+ FindPayloadReplacementOpInterface::kSilenceTrackingFailuresAttrName) {
if (!llvm::isa<UnitAttr>(attribute.getValue())) {
return op->emitError()
<< attribute.getName() << " must be a unit attribute";
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index ca80899ab073..8d2ed8f6d737 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -14,8 +14,8 @@
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Dominance.h"
diff --git a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
index 5f70235c2352..8d9f105d1c5d 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformTypes.cpp
@@ -8,7 +8,7 @@
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectImplementation.h"
@@ -19,8 +19,6 @@
using namespace mlir;
-#include "mlir/Dialect/Transform/IR/TransformTypeInterfaces.cpp.inc"
-
// These are automatically generated by ODS but are not used as the Transform
// dialect uses a different dispatch mechanism to support dialect extensions.
LLVM_ATTRIBUTE_UNUSED static OptionalParseResult
diff --git a/mlir/lib/Dialect/Transform/Interfaces/CMakeLists.txt b/mlir/lib/Dialect/Transform/Interfaces/CMakeLists.txt
new file mode 100644
index 000000000000..7b837bde0625
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/Interfaces/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_mlir_library(MLIRTransformDialectInterfaces
+ TransformInterfaces.cpp
+
+ DEPENDS
+ MLIRTransformInterfacesIncGen
+
+ LINK_LIBS PUBLIC
+ MLIRCastInterfaces
+ MLIRIR
+ MLIRRewrite
+ MLIRSideEffectInterfaces
+ MLIRTransforms
+ MLIRTransformDialectUtils
+)
+
diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
index fe2eea535ffd..48f3954b6cf6 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/Interfaces/TransformInterfaces.cpp
@@ -6,10 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
-#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
@@ -951,8 +949,8 @@ transform::TransformState::applyTransform(TransformOpInterface transform) {
DiagnosedSilenceableFailure trackingFailure =
trackingListener.checkAndResetError();
if (!transform->hasTrait<ReportTrackingListenerFailuresOpTrait>() ||
- transform->hasAttr(
- transform::TransformDialect::kSilenceTrackingFailuresAttrName)) {
+ transform->hasAttr(FindPayloadReplacementOpInterface::
+ kSilenceTrackingFailuresAttrName)) {
// Only report failures for ReportTrackingListenerFailuresOpTrait ops. Also
// do not report failures if the above mentioned attribute is set.
if (trackingFailure.isSilenceableFailure())
@@ -1649,23 +1647,7 @@ LogicalResult transform::detail::mapPossibleTopLevelTransformOpBlockArguments(
<< " were provided to the interpreter";
}
- // Top-level transforms can be used for matching. If no concrete operation
- // type is specified, the block argument is mapped to the top-level op.
- // Otherwise, it is mapped to all ops of the specified type within the
- // top-level op (including the top-level op itself). Once an op is added as
- // a target, its descendants are not explored any further.
- BlockArgument bbArg = region.front().getArgument(0);
- if (auto bbArgType = dyn_cast<transform::OperationType>(bbArg.getType())) {
- state.getTopLevel()->walk<WalkOrder::PreOrder>([&](Operation *op) {
- if (op->getName().getStringRef() == bbArgType.getOperationName()) {
- targets.push_back(op);
- return WalkResult::skip();
- }
- return WalkResult::advance();
- });
- } else {
- targets.push_back(state.getTopLevel());
- }
+ targets.push_back(state.getTopLevel());
for (unsigned i = 0, e = state.getNumTopLevelMappings(); i < e; ++i)
extraMappings.push_back(llvm::to_vector(state.getTopLevelMapping(i)));
@@ -2003,4 +1985,5 @@ LogicalResult transform::applyTransforms(
// Generated interface implementation.
//===----------------------------------------------------------------------===//
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.cpp.inc"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.cpp.inc"
+#include "mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp
index 45fa644f42ec..561d3d5b05af 100644
--- a/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/CheckUses.cpp
@@ -13,7 +13,7 @@
#include "mlir/Dialect/Transform/Transforms/Passes.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SetOperations.h"
diff --git a/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp b/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp
index 281c1b9f8fdb..20db09ca9e8d 100644
--- a/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/InferEffects.cpp
@@ -9,7 +9,7 @@
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Transforms/Passes.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
index 7adf223f3440..19906f15ae85 100644
--- a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/Passes.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
index a2f9e502e723..efb9359e1995 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
@@ -13,9 +13,9 @@
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/IR/Utils.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Verifier.h"
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
index 8a9cd7c52d82..232c9c96dd09 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
@@ -12,9 +12,9 @@
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
#include "mlir/Dialect/Transform/IR/Utils.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/Verifier.h"
#include "mlir/IR/Visitors.h"
diff --git a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
index 70f3fa8c297d..204462ffd047 100644
--- a/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/Vector/IR/CMakeLists.txt
@@ -1,5 +1,7 @@
add_mlir_dialect_library(MLIRVectorDialect
VectorOps.cpp
+ ValueBoundsOpInterfaceImpl.cpp
+ ScalableValueBoundsConstraintSet.cpp
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Vector/IR
diff --git a/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp
new file mode 100644
index 000000000000..6d7e3bc70f59
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.cpp
@@ -0,0 +1,103 @@
+//===- ScalableValueBoundsConstraintSet.cpp - Scalable Value Bounds -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
+
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+namespace mlir::vector {
+
+FailureOr<ConstantOrScalableBound::BoundSize>
+ConstantOrScalableBound::getSize() const {
+ if (map.isSingleConstant())
+ return BoundSize{map.getSingleConstantResult(), /*scalable=*/false};
+ if (map.getNumResults() != 1 || map.getNumInputs() != 1)
+ return failure();
+ auto binop = dyn_cast<AffineBinaryOpExpr>(map.getResult(0));
+ if (!binop || binop.getKind() != AffineExprKind::Mul)
+ return failure();
+ auto matchConstant = [&](AffineExpr expr, int64_t &constant) -> bool {
+ if (auto cst = dyn_cast<AffineConstantExpr>(expr)) {
+ constant = cst.getValue();
+ return true;
+ }
+ return false;
+ };
+ // Match `s0 * cst` or `cst * s0`:
+ int64_t cst = 0;
+ auto lhs = binop.getLHS();
+ auto rhs = binop.getRHS();
+ if ((matchConstant(lhs, cst) && isa<AffineSymbolExpr>(rhs)) ||
+ (matchConstant(rhs, cst) && isa<AffineSymbolExpr>(lhs))) {
+ return BoundSize{cst, /*scalable=*/true};
+ }
+ return failure();
+}
+
+char ScalableValueBoundsConstraintSet::ID = 0;
+
+FailureOr<ConstantOrScalableBound>
+ScalableValueBoundsConstraintSet::computeScalableBound(
+ Value value, std::optional<int64_t> dim, unsigned vscaleMin,
+ unsigned vscaleMax, presburger::BoundType boundType, bool closedUB,
+ StopConditionFn stopCondition) {
+ using namespace presburger;
+
+ assert(vscaleMin <= vscaleMax);
+ ScalableValueBoundsConstraintSet scalableCstr(value.getContext(), vscaleMin,
+ vscaleMax);
+
+ int64_t pos = scalableCstr.populateConstraintsSet(value, dim, stopCondition);
+
+ // Project out all variables apart from vscale.
+ // This should result in constraints in terms of vscale only.
+ scalableCstr.projectOut(
+ [&](ValueDim p) { return p.first != scalableCstr.getVscaleValue(); });
+
+ assert(scalableCstr.cstr.getNumDimAndSymbolVars() ==
+ scalableCstr.positionToValueDim.size() &&
+ "inconsistent mapping state");
+
+ // Check that the only symbols left are vscale.
+ for (int64_t i = 0; i < scalableCstr.cstr.getNumDimAndSymbolVars(); ++i) {
+ if (i == pos)
+ continue;
+ if (scalableCstr.positionToValueDim[i] !=
+ ValueDim(scalableCstr.getVscaleValue(),
+ ValueBoundsConstraintSet::kIndexValue)) {
+ return failure();
+ }
+ }
+
+ SmallVector<AffineMap, 1> lowerBound(1), upperBound(1);
+ scalableCstr.cstr.getSliceBounds(pos, 1, value.getContext(), &lowerBound,
+ &upperBound, closedUB);
+
+ auto invalidBound = [](auto &bound) {
+ return !bound[0] || bound[0].getNumResults() != 1;
+ };
+
+ AffineMap bound = [&] {
+ if (boundType == BoundType::EQ && !invalidBound(lowerBound) &&
+ lowerBound[0] == lowerBound[0]) {
+ return lowerBound[0];
+ } else if (boundType == BoundType::LB && !invalidBound(lowerBound)) {
+ return lowerBound[0];
+ } else if (boundType == BoundType::UB && !invalidBound(upperBound)) {
+ return upperBound[0];
+ }
+ return AffineMap{};
+ }();
+
+ if (!bound)
+ return failure();
+
+ return ConstantOrScalableBound{bound};
+}
+
+} // namespace mlir::vector
diff --git a/mlir/lib/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.cpp
new file mode 100644
index 000000000000..ca95072d9bb0
--- /dev/null
+++ b/mlir/lib/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.cpp
@@ -0,0 +1,51 @@
+//===- ValueBoundsOpInterfaceImpl.cpp - Impl. of ValueBoundsOpInterface ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Vector/IR/ValueBoundsOpInterfaceImpl.h"
+
+#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
+
+using namespace mlir;
+
+namespace mlir::vector {
+namespace {
+
+struct VectorScaleOpInterface
+ : public ValueBoundsOpInterface::ExternalModel<VectorScaleOpInterface,
+ VectorScaleOp> {
+ void populateBoundsForIndexValue(Operation *op, Value value,
+ ValueBoundsConstraintSet &cstr) const {
+ auto *scalableCstr = dyn_cast<ScalableValueBoundsConstraintSet>(&cstr);
+ if (!scalableCstr)
+ return;
+ auto vscaleOp = cast<VectorScaleOp>(op);
+ assert(value == vscaleOp.getResult() && "invalid value");
+ if (auto vscale = scalableCstr->getVscaleValue()) {
+ // All copies of vscale are equivalent.
+ scalableCstr->bound(value) == cstr.getExpr(vscale);
+ } else {
+ // We know vscale is confined to [vscaleMin, vscaleMax].
+ scalableCstr->bound(value) >= scalableCstr->getVscaleMin();
+ scalableCstr->bound(value) <= scalableCstr->getVscaleMax();
+ scalableCstr->setVscale(vscaleOp);
+ }
+ }
+};
+
+} // namespace
+} // namespace mlir::vector
+
+void mlir::vector::registerValueBoundsOpInterfaceExternalModels(
+ DialectRegistry &registry) {
+ registry.addExtension(+[](MLIRContext *ctx, vector::VectorDialect *dialect) {
+ vector::VectorScaleOp::attachInterface<vector::VectorScaleOpInterface>(
+ *ctx);
+ });
+}
diff --git a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
index 6c2cfd8833dd..885644864c0f 100644
--- a/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
+++ b/mlir/lib/Dialect/Vector/TransformOps/VectorTransformOps.cpp
@@ -13,8 +13,8 @@
#include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
diff --git a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
index 0eaf9f71a37d..ba1c96805ff8 100644
--- a/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/LowerVectorContract.cpp
@@ -41,7 +41,6 @@ using namespace mlir::vector;
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
-
// Helper to find an index in an affine map.
static std::optional<int64_t> getResultIndex(AffineMap map, int64_t index) {
for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) {
@@ -226,9 +225,9 @@ namespace {
/// This only kicks in when VectorTransformsOptions is set to OuterProduct and
/// the vector.contract op is a row-major matrix multiply.
class ContractionOpToMatmulOpLowering
- : public OpRewritePattern<vector::ContractionOp> {
+ : public vector::MaskableOpRewritePattern<vector::ContractionOp> {
public:
- using OpRewritePattern::OpRewritePattern;
+ using MaskableOpRewritePattern::MaskableOpRewritePattern;
using FilterConstraintType =
std::function<LogicalResult(vector::ContractionOp op)>;
@@ -241,12 +240,13 @@ public:
vector::VectorTransformsOptions vectorTransformOptions,
MLIRContext *context, PatternBenefit benefit = 1,
FilterConstraintType constraint = defaultFilter)
- : OpRewritePattern<vector::ContractionOp>(context, benefit),
+ : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
vectorTransformOptions(vectorTransformOptions),
filter(std::move(constraint)) {}
- LogicalResult matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const override;
+ FailureOr<Value>
+ matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const override;
private:
/// Options to control the vector patterns.
@@ -270,9 +270,9 @@ private:
/// This only kicks in when VectorTransformsOptions is set to OuterProduct and
/// the vector.contract op is a row-major matrix multiply.
class ContractionOpToOuterProductOpLowering
- : public OpRewritePattern<vector::ContractionOp> {
+ : public MaskableOpRewritePattern<vector::ContractionOp> {
public:
- using OpRewritePattern::OpRewritePattern;
+ using MaskableOpRewritePattern::MaskableOpRewritePattern;
using FilterConstraintType =
std::function<LogicalResult(vector::ContractionOp op)>;
@@ -285,12 +285,13 @@ public:
vector::VectorTransformsOptions vectorTransformOptions,
MLIRContext *context, PatternBenefit benefit = 1,
FilterConstraintType constraint = defaultFilter)
- : OpRewritePattern<vector::ContractionOp>(context, benefit),
+ : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
vectorTransformOptions(vectorTransformOptions),
filter(std::move(constraint)) {}
- LogicalResult matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const override;
+ FailureOr<Value>
+ matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const override;
private:
/// Options to control the vector patterns.
@@ -317,9 +318,9 @@ private:
/// This only kicks in when VectorTransformsOptions is set to Dot and
/// the vector.contract op is a row-major matmul or matvec.
class ContractionOpToDotLowering
- : public OpRewritePattern<vector::ContractionOp> {
+ : public MaskableOpRewritePattern<vector::ContractionOp> {
public:
- using OpRewritePattern::OpRewritePattern;
+ using MaskableOpRewritePattern::MaskableOpRewritePattern;
using FilterConstraintType =
std::function<LogicalResult(vector::ContractionOp op)>;
@@ -332,11 +333,12 @@ public:
vector::VectorTransformsOptions vectorTransformOptions,
MLIRContext *context, PatternBenefit benefit = 1,
const FilterConstraintType &constraint = defaultFilter)
- : OpRewritePattern<vector::ContractionOp>(context, benefit),
+ : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {}
- LogicalResult matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const override;
+ FailureOr<Value>
+ matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const override;
private:
/// Options to control the vector patterns.
@@ -358,9 +360,10 @@ private:
///
/// This only kicks in when either VectorTransformsOptions is set
/// to Dot or when other contraction patterns fail.
-class ContractionOpLowering : public OpRewritePattern<vector::ContractionOp> {
+class ContractionOpLowering
+ : public MaskableOpRewritePattern<vector::ContractionOp> {
public:
- using OpRewritePattern::OpRewritePattern;
+ using MaskableOpRewritePattern::MaskableOpRewritePattern;
using FilterConstraintType =
std::function<LogicalResult(vector::ContractionOp op)>;
@@ -371,12 +374,13 @@ public:
ContractionOpLowering(vector::VectorTransformsOptions vectorTransformOptions,
MLIRContext *context, PatternBenefit benefit = 1,
FilterConstraintType constraint = defaultFilter)
- : OpRewritePattern<vector::ContractionOp>(context, benefit),
+ : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
vectorTransformOptions(vectorTransformOptions),
filter(std::move(constraint)) {}
- LogicalResult matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const override;
+ FailureOr<Value>
+ matchAndRewriteMaskableOp(vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const override;
private:
/// Options to control the vector patterns.
@@ -634,8 +638,10 @@ private:
///
/// This only kicks in when VectorTransformsOptions is set to OuterProduct but
/// otherwise supports any layout permutation of the matrix-multiply.
-LogicalResult ContractionOpToOuterProductOpLowering::matchAndRewrite(
- vector::ContractionOp op, PatternRewriter &rewriter) const {
+FailureOr<Value>
+ContractionOpToOuterProductOpLowering::matchAndRewriteMaskableOp(
+ vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const {
if (vectorTransformOptions.vectorContractLowering !=
vector::VectorContractLowering::OuterProduct)
return failure();
@@ -643,43 +649,25 @@ LogicalResult ContractionOpToOuterProductOpLowering::matchAndRewrite(
if (failed(filter(op)))
return failure();
- // Vector mask setup.
- OpBuilder::InsertionGuard guard(rewriter);
- auto maskableOp = cast<vector::MaskableOpInterface>(op.getOperation());
- Operation *rootOp;
- if (maskableOp.isMasked()) {
- rewriter.setInsertionPoint(maskableOp.getMaskingOp());
- rootOp = maskableOp.getMaskingOp();
- } else {
- rootOp = op;
- }
-
UnrolledOuterProductGenerator e(rewriter, op);
FailureOr<Value> matmatRes = e.matmat();
if (succeeded(matmatRes)) {
- rewriter.replaceOp(rootOp, *matmatRes);
- return success();
+ return matmatRes;
}
FailureOr<Value> matvecRes = e.matvec();
if (succeeded(matvecRes)) {
- rewriter.replaceOp(rootOp, *matvecRes);
- return success();
- }
- FailureOr<Value> tmatvecRes = e.tmatvec();
- if (succeeded(tmatvecRes)) {
- rewriter.replaceOp(rootOp, *tmatvecRes);
- return success();
+ return matvecRes;
}
- return failure();
+ FailureOr<Value> tmatvecRes = e.tmatvec();
+ return tmatvecRes;
}
-LogicalResult
-ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const {
+FailureOr<Value> ContractionOpToDotLowering::matchAndRewriteMaskableOp(
+ vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const {
// TODO: Support vector.mask.
- auto maskableOp = cast<MaskableOpInterface>(op.getOperation());
- if (maskableOp.isMasked())
+ if (maskOp)
return failure();
if (failed(filter(op)))
@@ -788,15 +776,14 @@ ContractionOpToDotLowering::matchAndRewrite(vector::ContractionOp op,
}
if (auto acc = op.getAcc())
res = createAdd(op.getLoc(), res, acc, isInt, rewriter);
- rewriter.replaceOp(op, res);
- return success();
+ return res;
}
/// Lower vector.contract with all size one reduction dimensions to
/// elementwise ops when possible.
struct ContractOpToElementwise
- : public OpRewritePattern<vector::ContractionOp> {
- using OpRewritePattern::OpRewritePattern;
+ : public MaskableOpRewritePattern<vector::ContractionOp> {
+ using MaskableOpRewritePattern::MaskableOpRewritePattern;
using FilterConstraintType =
std::function<LogicalResult(vector::ContractionOp op)>;
static LogicalResult defaultFilter(vector::ContractionOp op) {
@@ -806,14 +793,15 @@ struct ContractOpToElementwise
vector::VectorTransformsOptions vectorTransformOptions,
MLIRContext *context, PatternBenefit benefit = 1,
const FilterConstraintType &constraint = defaultFilter)
- : OpRewritePattern<vector::ContractionOp>(context, benefit),
+ : MaskableOpRewritePattern<vector::ContractionOp>(context, benefit),
vectorTransformOptions(vectorTransformOptions), filter(defaultFilter) {}
- LogicalResult matchAndRewrite(vector::ContractionOp contractOp,
- PatternRewriter &rewriter) const override {
+ FailureOr<Value>
+ matchAndRewriteMaskableOp(vector::ContractionOp contractOp,
+ MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const override {
// TODO: Support vector.mask.
- auto maskableOp = cast<MaskableOpInterface>(contractOp.getOperation());
- if (maskableOp.isMasked())
+ if (maskOp)
return failure();
if (failed(filter(contractOp)))
@@ -903,8 +891,10 @@ struct ContractOpToElementwise
std::optional<Value> result =
createContractArithOp(loc, newLhs, newRhs, contractOp.getAcc(),
contractOp.getKind(), rewriter, isInt);
- rewriter.replaceOp(contractOp, {*result});
- return success();
+ if (result)
+ return *result;
+
+ return failure();
}
private:
@@ -930,9 +920,9 @@ private:
// TODO: break down into transpose/reshape/cast ops
// when they become available to avoid code dup
// TODO: investigate lowering order impact on performance
-LogicalResult
-ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rewriter) const {
+FailureOr<Value> ContractionOpLowering::matchAndRewriteMaskableOp(
+ vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rewriter) const {
if (failed(filter(op)))
return failure();
@@ -951,29 +941,36 @@ ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
// TODO: implement benefits, cost models.
MLIRContext *ctx = op.getContext();
+
ContractionOpToMatmulOpLowering pat1(vectorTransformOptions, ctx);
- if (succeeded(pat1.matchAndRewrite(op, rewriter)))
- return success();
+ FailureOr<Value> newVal1 =
+ pat1.matchAndRewriteMaskableOp(op, maskOp, rewriter);
+ if (!failed(newVal1))
+ return newVal1;
+
ContractionOpToOuterProductOpLowering pat2(vectorTransformOptions, ctx);
- if (succeeded(pat2.matchAndRewrite(op, rewriter)))
- return success();
+ FailureOr<Value> newVal2 =
+ pat2.matchAndRewriteMaskableOp(op, maskOp, rewriter);
+ if (!failed(newVal2))
+ return newVal2;
+
ContractionOpToDotLowering pat3(vectorTransformOptions, ctx);
- if (succeeded(pat3.matchAndRewrite(op, rewriter)))
- return success();
+ FailureOr<Value> newVal3 =
+ pat3.matchAndRewriteMaskableOp(op, maskOp, rewriter);
+ if (!failed(newVal3))
+ return newVal3;
+
ContractOpToElementwise pat4(vectorTransformOptions, ctx);
- if (succeeded(pat4.matchAndRewrite(op, rewriter)))
- return success();
+ FailureOr<Value> newVal4 =
+ pat4.matchAndRewriteMaskableOp(op, maskOp, rewriter);
+ if (!failed(newVal4))
+ return newVal4;
// Vector mask setup.
- OpBuilder::InsertionGuard guard(rewriter);
- Operation *rootOp = op;
- Value mask;
- if (op.isMasked()) {
- rewriter.setInsertionPoint(op.getMaskingOp());
- rootOp = op.getMaskingOp();
- mask = op.getMaskingOp().getMask();
- }
+ Value mask;
+ if (maskOp)
+ mask = maskOp.getMask();
// Find first batch dimension in LHS/RHS, and lower when found.
std::vector<std::pair<int64_t, int64_t>> batchDimMap = op.getBatchDimMap();
if (!batchDimMap.empty()) {
@@ -982,8 +979,7 @@ ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
auto newOp = lowerParallel(rewriter, op, lhsIndex, rhsIndex, mask);
if (failed(newOp))
return failure();
- rewriter.replaceOp(rootOp, *newOp);
- return success();
+ return newOp;
}
// Collect contracting dimensions.
@@ -1003,8 +999,7 @@ ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
auto newOp = lowerParallel(rewriter, op, lhsIndex, /*rhsIndex=*/-1, mask);
if (failed(newOp))
return failure();
- rewriter.replaceOp(rootOp, *newOp);
- return success();
+ return newOp;
}
}
@@ -1015,8 +1010,7 @@ ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
auto newOp = lowerParallel(rewriter, op, /*lhsIndex=*/-1, rhsIndex, mask);
if (failed(newOp))
return failure();
- rewriter.replaceOp(rootOp, *newOp);
- return success();
+ return newOp;
}
}
@@ -1025,8 +1019,7 @@ ContractionOpLowering::matchAndRewrite(vector::ContractionOp op,
auto newOp = lowerReduction(rewriter, op, mask);
if (failed(newOp))
return failure();
- rewriter.replaceOp(rootOp, *newOp);
- return success();
+ return newOp;
}
return failure();
@@ -1291,12 +1284,11 @@ public:
/// This only kicks in when VectorTransformsOptions is set to `Matmul`.
/// vector.transpose operations are inserted if the vector.contract op is not a
/// row-major matrix multiply.
-LogicalResult
-ContractionOpToMatmulOpLowering::matchAndRewrite(vector::ContractionOp op,
- PatternRewriter &rew) const {
+FailureOr<Value> ContractionOpToMatmulOpLowering::matchAndRewriteMaskableOp(
+ vector::ContractionOp op, MaskingOpInterface maskOp,
+ PatternRewriter &rew) const {
// TODO: Support vector.mask.
- auto maskableOp = cast<MaskableOpInterface>(op.getOperation());
- if (maskableOp.isMasked())
+ if (maskOp)
return failure();
if (vectorTransformOptions.vectorContractLowering !=
@@ -1379,8 +1371,7 @@ ContractionOpToMatmulOpLowering::matchAndRewrite(vector::ContractionOp op,
: static_cast<Value>(
rew.create<arith::AddFOp>(loc, op.getAcc(), mul));
- rew.replaceOp(op, res);
- return success();
+ return res;
}
} // namespace
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 2e99f39ed86d..617c89a84ee0 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -11,5 +11,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
MLIRXeGPUEnumsIncGen
LINK_LIBS PUBLIC
+ MLIRDialectUtils
MLIRIR
+ MLIRViewLikeInterface
)
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 4f839ee77347..0b3f4b9c9dbe 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -6,7 +6,10 @@
//
//===----------------------------------------------------------------------===//
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "llvm/ADT/TypeSwitch.h"
namespace mlir {
namespace xegpu {
@@ -26,8 +29,72 @@ void XeGPUDialect::initialize() {
>();
}
-// this file is for position occupation,
-// we will add functions in following PRs.
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescAttr
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XeGPU_TensorDescType
+//===----------------------------------------------------------------------===//
+mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) {
+ llvm::SmallVector<int64_t> shape;
+ mlir::Type elementType;
+ mlir::FailureOr<mlir::Attribute> encoding;
+
+ // Parse literal '<'
+ if (parser.parseLess())
+ return {};
+
+ auto shapeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseDimensionList(shape))) {
+ parser.emitError(shapeLoc, "failed to parse parameter 'shape'");
+ return {};
+ }
+
+ auto elemTypeLoc = parser.getCurrentLocation();
+ if (mlir::failed(parser.parseType(elementType))) {
+ parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'");
+ return {};
+ }
+
+ // parse optional attributes
+ if (mlir::succeeded(parser.parseOptionalComma())) {
+ encoding = mlir::FieldParser<mlir::Attribute>::parse(parser);
+ if (mlir::failed(encoding)) {
+ parser.emitError(
+ parser.getCurrentLocation(),
+ "Failed to parse the attribute field for TensorDescType.\n");
+ return {};
+ }
+ }
+
+ // Parse literal '>'
+ if (parser.parseGreater())
+ return {};
+
+ return TensorDescType::get(parser.getContext(), shape, elementType,
+ encoding.value_or(mlir::Attribute()));
+}
+
+void TensorDescType::print(::mlir::AsmPrinter &printer) const {
+ printer << "<";
+
+ auto shape = getShape();
+ for (int64_t dim : shape) {
+ if (mlir::ShapedType::isDynamic(dim))
+ printer << '?';
+ else
+ printer << dim;
+ printer << 'x';
+ }
+
+ printer << getElementType();
+
+ if (auto encoding = getEncoding())
+ printer << ", " << encoding;
+
+ printer << ">";
+}
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index b356c397fb83..02106f221f32 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,15 +6,196 @@
//
//===----------------------------------------------------------------------===//
-#include <mlir/Dialect/XeGPU/IR/XeGPU.h>
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/IR/Builders.h"
#define DEBUG_TYPE "xegpu"
namespace mlir {
namespace xegpu {
-// this file is for position occupation,
-// we will add functions in following PRs.
+static void transpose(llvm::ArrayRef<int64_t> trans,
+ std::vector<int64_t> &shape) {
+ std::vector<int64_t> old = shape;
+ for (size_t i = 0; i < trans.size(); i++)
+ shape[i] = old[trans[i]];
+}
+
+template <typename T>
+static std::string makeString(T array, bool breakline = false) {
+ std::string buf;
+ buf.clear();
+ llvm::raw_string_ostream os(buf);
+ os << "[";
+ for (size_t i = 1; i < array.size(); i++) {
+ os << array[i - 1] << ", ";
+ if (breakline)
+ os << "\n\t\t";
+ }
+ os << array.back() << "]";
+ os.flush();
+ return buf;
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_CreateNdDescOp
+//===----------------------------------------------------------------------===//
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type tdesc, TypedValue<MemRefType> source,
+ llvm::ArrayRef<OpFoldResult> offsets) {
+ [[maybe_unused]] auto ty = source.getType();
+ assert(ty.hasStaticShape() && offsets.size() == (size_t)ty.getRank());
+
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<Value> dynamicOffsets;
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+
+ build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */,
+ ValueRange({}) /* empty dynamic shape */,
+ ValueRange({}) /* empty dynamic strides */,
+ staticOffsets /* const offsets */, {} /* empty const shape*/,
+ {} /* empty const strides*/);
+}
+
+void CreateNdDescOp::build(OpBuilder &builder, OperationState &state,
+ Type tdesc, TypedValue<IntegerType> source,
+ llvm::ArrayRef<OpFoldResult> offsets,
+ llvm::ArrayRef<OpFoldResult> shape,
+ llvm::ArrayRef<OpFoldResult> strides) {
+ assert(shape.size() && offsets.size() && strides.size() &&
+ shape.size() == strides.size() && shape.size() == offsets.size());
+
+ llvm::SmallVector<int64_t> staticOffsets;
+ llvm::SmallVector<int64_t> staticShape;
+ llvm::SmallVector<int64_t> staticStrides;
+ llvm::SmallVector<Value> dynamicOffsets;
+ llvm::SmallVector<Value> dynamicShape;
+ llvm::SmallVector<Value> dynamicStrides;
+
+ dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets);
+ dispatchIndexOpFoldResults(shape, dynamicShape, staticShape);
+ dispatchIndexOpFoldResults(strides, dynamicStrides, staticOffsets);
+
+ auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets);
+ auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape);
+ auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides);
+
+ build(builder, state, tdesc, source, dynamicOffsets, dynamicShape,
+ dynamicStrides, staticOffsetsAttr, staticShapeAttr, staticStridesAttr);
+}
+
+LogicalResult CreateNdDescOp::verify() {
+ auto rank = (int64_t)getMixedOffsets().size();
+ bool invalidRank = (rank != 2);
+ bool invalidElemTy = false;
+
+ // check source type matches the rank if it is a memref.
+ // It also should have the same ElementType as TensorDesc.
+ auto memrefTy = getSourceType().dyn_cast<MemRefType>();
+ if (memrefTy) {
+ invalidRank |= (memrefTy.getRank() != rank);
+ invalidElemTy |= memrefTy.getElementType() != getElementType();
+ }
+
+ // check result type matches the rank
+ invalidRank = (getType().getRank() != rank);
+
+  // Mismatches among shape, strides, and offsets are
+  // already handled by OffsetSizeAndStrideOpInterface.
+  // So they are not checked here.
+ if (invalidRank)
+ return emitOpError(
+ "Expecting the rank of shape, strides, offsets, "
+ "source memref type (if source is a memref) and TensorDesc "
+ "should match with each other. They currenlty are 2D.");
+
+ if (invalidElemTy)
+ return emitOpError("TensorDesc should have the same element "
+ "type with the source if it is a memref.\n");
+
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_LoadNdOp
+//===----------------------------------------------------------------------===//
+LogicalResult LoadNdOp::verify() {
+ auto tdescTy = getTensorDescType();
+ auto valueTy = getType();
+
+ if (tdescTy.getRank() != 2)
+ return emitOpError(
+ "The TensorDesc for LoadNdOp should be a 2D TensorDesc.");
+
+ if (!valueTy)
+ return emitOpError("Invalid result, it should be a VectorType.\n");
+
+ auto tdescElemTy = tdescTy.getElementType();
+ auto valueElemTy = valueTy.getElementType();
+
+ if (tdescElemTy != valueElemTy)
+ return emitOpError(
+ "Value should have the same element type as TensorDesc.");
+
+ auto array_len = tdescTy.getArrayLength();
+ auto tdescShape = tdescTy.getShape().vec();
+ auto valueShape = valueTy.getShape().vec();
+
+ if (getTranspose()) {
+ auto trans = getTranspose().value();
+ if (tdescShape.size() >= trans.size())
+ transpose(trans, tdescShape);
+ else
+ emitWarning("Invalid transpose attr. It is ignored.");
+ }
+
+ if (getVnniAxis()) {
+ auto axis = getVnniAxis().value();
+ auto vnni_factor = valueShape.back();
+ tdescShape[axis] /= vnni_factor;
+ tdescShape.push_back(vnni_factor);
+ }
+
+ if (array_len > 1) {
+ auto it = tdescShape.begin();
+ tdescShape.insert(it, array_len);
+ }
+
+ if (tdescShape != valueShape)
+ return emitOpError() << "Result shape doesn't match TensorDesc shape."
+ << "The expected shape is " << makeString(tdescShape)
+ << ". But the given shape is "
+ << makeString(valueShape) << ".\n";
+ return success();
+}
+
+//===----------------------------------------------------------------------===//
+// XeGPU_StoreNdOp
+//===----------------------------------------------------------------------===//
+LogicalResult StoreNdOp::verify() {
+ auto dstTy = getTensorDesc().getType(); // Tile
+ auto valTy = getValue().getType().cast<VectorType>(); // Vector
+
+ if (dstTy.getRank() != 2)
+ return emitOpError("Expecting a 2D TensorDesc shape.\n");
+
+ if (!valTy)
+ return emitOpError("Exepcting a VectorType result.\n");
+
+ auto dstElemTy = dstTy.getElementType();
+ auto valElemTy = valTy.getElementType();
+
+ if (dstElemTy != valElemTy) {
+ return emitOpError() << "The element type of the value should "
+ "match the elementtype of the TensorDesc.\n";
+ }
+
+ if (dstTy.getShape() != valTy.getShape())
+ return emitOpError()
+ << "The result shape should match the TensorDesc shape.\n";
+ return success();
+}
} // namespace xegpu
} // namespace mlir
diff --git a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
index 65c41f44192a..b5b7d78cfeff 100644
--- a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
+++ b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
@@ -218,7 +218,23 @@ uint64_t mlir::detail::getDefaultPreferredAlignment(
reportMissingDataLayout(type);
}
-// Returns the memory space used for allocal operations if specified in the
+std::optional<uint64_t> mlir::detail::getDefaultIndexBitwidth(
+ Type type, const DataLayout &dataLayout,
+ ArrayRef<DataLayoutEntryInterface> params) {
+ if (isa<IndexType>(type))
+ return getIndexBitwidth(params);
+
+ if (auto typeInterface = dyn_cast<DataLayoutTypeInterface>(type))
+ if (std::optional<uint64_t> indexBitwidth =
+ typeInterface.getIndexBitwidth(dataLayout, params))
+ return *indexBitwidth;
+
+ // Return std::nullopt for all other types, which are assumed to be non
+ // pointer-like types.
+ return std::nullopt;
+}
+
+// Returns the memory space used for alloca operations if specified in the
// given entry. If the entry is empty the default memory space represented by
// an empty attribute is returned.
Attribute
@@ -520,6 +536,18 @@ uint64_t mlir::DataLayout::getTypePreferredAlignment(Type t) const {
});
}
+std::optional<uint64_t> mlir::DataLayout::getTypeIndexBitwidth(Type t) const {
+ checkValid();
+ return cachedLookup<std::optional<uint64_t>>(t, indexBitwidths, [&](Type ty) {
+ DataLayoutEntryList list;
+ if (originalLayout)
+ list = originalLayout.getSpecForType(ty.getTypeID());
+ if (auto iface = dyn_cast_or_null<DataLayoutOpInterface>(scope))
+ return iface.getIndexBitwidth(ty, *this, list);
+ return detail::getDefaultIndexBitwidth(ty, *this, list);
+ });
+}
+
mlir::Attribute mlir::DataLayout::getAllocaMemorySpace() const {
checkValid();
if (allocaMemorySpace)
diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
index 85abc2df8947..06ec3f4e135e 100644
--- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
+++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp
@@ -70,6 +70,8 @@ static std::optional<int64_t> getConstantIntValue(OpFoldResult ofr) {
ValueBoundsConstraintSet::ValueBoundsConstraintSet(MLIRContext *ctx)
: builder(ctx) {}
+char ValueBoundsConstraintSet::ID = 0;
+
#ifndef NDEBUG
static void assertValidValueDim(Value value, std::optional<int64_t> dim) {
if (value.getType().isIndex()) {
@@ -472,54 +474,86 @@ FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
}
FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
+ presburger::BoundType type, AffineMap map, ArrayRef<Value> operands,
+ StopConditionFn stopCondition, bool closedUB) {
+ ValueDimList valueDims;
+ for (Value v : operands) {
+ assert(v.getType().isIndex() && "expected index type");
+ valueDims.emplace_back(v, std::nullopt);
+ }
+ return computeConstantBound(type, map, valueDims, stopCondition, closedUB);
+}
+
+FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
presburger::BoundType type, AffineMap map, ValueDimList operands,
StopConditionFn stopCondition, bool closedUB) {
assert(map.getNumResults() == 1 && "expected affine map with one result");
ValueBoundsConstraintSet cstr(map.getContext());
- int64_t pos = cstr.insert(/*isSymbol=*/false);
+
+ int64_t pos = 0;
+ if (stopCondition) {
+ cstr.populateConstraintsSet(map, operands, stopCondition, &pos);
+ } else {
+ // No stop condition specified: Keep adding constraints until a bound could
+ // be computed.
+ cstr.populateConstraintsSet(
+ map, operands,
+ [&](Value v, std::optional<int64_t> dim) {
+ return cstr.cstr.getConstantBound64(type, pos).has_value();
+ },
+ &pos);
+ }
+ // Compute constant bound for `valueDim`.
+ int64_t ubAdjustment = closedUB ? 0 : 1;
+ if (auto bound = cstr.cstr.getConstantBound64(type, pos))
+ return type == BoundType::UB ? *bound + ubAdjustment : *bound;
+ return failure();
+}
+
+int64_t ValueBoundsConstraintSet::populateConstraintsSet(
+ Value value, std::optional<int64_t> dim, StopConditionFn stopCondition) {
+#ifndef NDEBUG
+ assertValidValueDim(value, dim);
+#endif // NDEBUG
+
+ AffineMap map =
+ AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
+ Builder(value.getContext()).getAffineDimExpr(0));
+ return populateConstraintsSet(map, {{value, dim}}, stopCondition);
+}
+
+int64_t ValueBoundsConstraintSet::populateConstraintsSet(
+ AffineMap map, ValueDimList operands, StopConditionFn stopCondition,
+ int64_t *posOut) {
+ assert(map.getNumResults() == 1 && "expected affine map with one result");
+ int64_t pos = insert(/*isSymbol=*/false);
+ if (posOut)
+ *posOut = pos;
// Add map and operands to the constraint set. Dimensions are converted to
// symbols. All operands are added to the worklist.
auto mapper = [&](std::pair<Value, std::optional<int64_t>> v) {
- return cstr.getExpr(v.first, v.second);
+ return getExpr(v.first, v.second);
};
SmallVector<AffineExpr> dimReplacements = llvm::to_vector(
llvm::map_range(ArrayRef(operands).take_front(map.getNumDims()), mapper));
SmallVector<AffineExpr> symReplacements = llvm::to_vector(
llvm::map_range(ArrayRef(operands).drop_front(map.getNumDims()), mapper));
- cstr.addBound(
+ addBound(
presburger::BoundType::EQ, pos,
map.getResult(0).replaceDimsAndSymbols(dimReplacements, symReplacements));
// Process the backward slice of `operands` (i.e., reverse use-def chain)
// until `stopCondition` is met.
if (stopCondition) {
- cstr.processWorklist(stopCondition);
+ processWorklist(stopCondition);
} else {
- // No stop condition specified: Keep adding constraints until a bound could
- // be computed.
- cstr.processWorklist(
- /*stopCondition=*/[&](Value v, std::optional<int64_t> dim) {
- return cstr.cstr.getConstantBound64(type, pos).has_value();
- });
+ // No stop condition specified: Keep adding constraints until the worklist
+ // is empty.
+ processWorklist([](Value v, std::optional<int64_t> dim) { return false; });
}
- // Compute constant bound for `valueDim`.
- int64_t ubAdjustment = closedUB ? 0 : 1;
- if (auto bound = cstr.cstr.getConstantBound64(type, pos))
- return type == BoundType::UB ? *bound + ubAdjustment : *bound;
- return failure();
-}
-
-FailureOr<int64_t> ValueBoundsConstraintSet::computeConstantBound(
- presburger::BoundType type, AffineMap map, ArrayRef<Value> operands,
- StopConditionFn stopCondition, bool closedUB) {
- ValueDimList valueDims;
- for (Value v : operands) {
- assert(v.getType().isIndex() && "expected index type");
- valueDims.emplace_back(v, std::nullopt);
- }
- return computeConstantBound(type, map, valueDims, stopCondition, closedUB);
+ return pos;
}
FailureOr<int64_t>
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
index 995544238e4a..f90495d407fd 100644
--- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp
@@ -274,16 +274,15 @@ translateDataLayout(DataLayoutSpecInterface attribute,
layoutStream << ":" << preferred;
return success();
})
- .Case([&](LLVMPointerType ptrType) {
- layoutStream << "p" << ptrType.getAddressSpace() << ":";
+ .Case([&](LLVMPointerType type) {
+ layoutStream << "p" << type.getAddressSpace() << ":";
uint64_t size = dataLayout.getTypeSizeInBits(type);
uint64_t abi = dataLayout.getTypeABIAlignment(type) * 8u;
uint64_t preferred =
dataLayout.getTypePreferredAlignment(type) * 8u;
- layoutStream << size << ":" << abi << ":" << preferred;
- if (std::optional<uint64_t> index = extractPointerSpecValue(
- entry.getValue(), PtrDLEntryPos::Index))
- layoutStream << ":" << *index;
+ uint64_t index = *dataLayout.getTypeIndexBitwidth(type);
+ layoutStream << size << ":" << abi << ":" << preferred << ":"
+ << index;
return success();
})
.Default([loc](Type type) {
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index dd3b6c2080aa..8877ee083286 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -112,7 +112,7 @@ gpu.module @test_module_3 {
gpu.module @test_module_4 {
// CHECK-LABEL: func @gpu_shuffle()
- func.func @gpu_shuffle() -> (f32, f32, f32, f32) {
+ func.func @gpu_shuffle() -> (f32, f32, f32, f32, i1, i1, i1, i1) {
// CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
%arg0 = arith.constant 1.0 : f32
// CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
@@ -143,11 +143,41 @@ gpu.module @test_module_4 {
// CHECK: nvvm.shfl.sync idx {{.*}} {return_value_and_is_valid} : f32 -> !llvm.struct<(f32, i1)>
%shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
- func.return %shfl, %shflu, %shfld, %shfli : f32, f32,f32, f32
+ func.return %shfl, %shflu, %shfld, %shfli, %pred, %predu, %predd, %predi
+ : f32, f32,f32, f32, i1, i1, i1, i1
}
-}
+ // CHECK-LABEL: func @gpu_shuffle_unused_pred()
+ func.func @gpu_shuffle_unused_pred() -> (f32, f32, f32, f32) {
+ // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
+ %arg0 = arith.constant 1.0 : f32
+ // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32
+ %arg1 = arith.constant 4 : i32
+ // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32
+ %arg2 = arith.constant 23 : i32
+ // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+ // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+ // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
+ // CHECK: %[[#CLAMP:]] = llvm.sub %[[#WIDTH]], %[[#ONE]] : i32
+ // CHECK: %[[#SHFL:]] = nvvm.shfl.sync bfly %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#CLAMP]] : f32 -> f32
+ %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32
+ // CHECK: %[[#ONE:]] = llvm.mlir.constant(1 : i32) : i32
+ // CHECK: %[[#MINUS_ONE:]] = llvm.mlir.constant(-1 : i32) : i32
+ // CHECK: %[[#THIRTY_TWO:]] = llvm.mlir.constant(32 : i32) : i32
+ // CHECK: %[[#NUM_LANES:]] = llvm.sub %[[#THIRTY_TWO]], %[[#WIDTH]] : i32
+ // CHECK: %[[#MASK:]] = llvm.lshr %[[#MINUS_ONE]], %[[#NUM_LANES]] : i32
+ // CHECK: %[[#SHFL:]] = nvvm.shfl.sync up %[[#MASK]], %[[#VALUE]], %[[#OFFSET]], %[[#NUM_LANES]] : f32 -> f32
+ %shflu, %predu = gpu.shuffle up %arg0, %arg1, %arg2 : f32
+ // CHECK: nvvm.shfl.sync down {{.*}} : f32 -> f32
+ %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32
+ // CHECK: nvvm.shfl.sync idx {{.*}} : f32 -> f32
+ %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32
+ func.return %shfl, %shflu, %shfld, %shfli : f32, f32,f32, f32
+ }
+}
gpu.module @test_module_5 {
// CHECK-LABEL: func @gpu_sync()
diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
index fa12da8ef9d4..4339799ccd5e 100644
--- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
+++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir
@@ -60,13 +60,9 @@ module attributes {
// CHECK: %[[INDEX2:.*]] = spirv.IAdd %[[ARG4]], %[[LOCALINVOCATIONIDX]]
%13 = arith.addi %arg4, %3 : index
// CHECK: %[[ZERO:.*]] = spirv.Constant 0 : i32
- // CHECK: %[[OFFSET1_0:.*]] = spirv.Constant 0 : i32
// CHECK: %[[STRIDE1_1:.*]] = spirv.Constant 4 : i32
- // CHECK: %[[UPDATE1_1:.*]] = spirv.IMul %[[STRIDE1_1]], %[[INDEX1]] : i32
- // CHECK: %[[OFFSET1_1:.*]] = spirv.IAdd %[[OFFSET1_0]], %[[UPDATE1_1]] : i32
- // CHECK: %[[STRIDE1_2:.*]] = spirv.Constant 1 : i32
- // CHECK: %[[UPDATE1_2:.*]] = spirv.IMul %[[STRIDE1_2]], %[[INDEX2]] : i32
- // CHECK: %[[OFFSET1_2:.*]] = spirv.IAdd %[[OFFSET1_1]], %[[UPDATE1_2]] : i32
+ // CHECK: %[[UPDATE1_1:.*]] = spirv.IMul %[[INDEX1]], %[[STRIDE1_1]] : i32
+ // CHECK: %[[OFFSET1_2:.*]] = spirv.IAdd %[[INDEX2]], %[[UPDATE1_1]] : i32
// CHECK: %[[PTR1:.*]] = spirv.AccessChain %[[ARG0]]{{\[}}%[[ZERO]], %[[OFFSET1_2]]{{\]}}
// CHECK-NEXT: %[[VAL1:.*]] = spirv.Load "StorageBuffer" %[[PTR1]]
%14 = memref.load %arg0[%12, %13] : memref<12x4xf32, #spirv.storage_class<StorageBuffer>>
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir
new file mode 100644
index 000000000000..390190d341e5
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir
@@ -0,0 +1,40 @@
+// RUN: mlir-opt -convert-memref-to-emitc %s -split-input-file -verify-diagnostics
+
+func.func @memref_op(%arg0 : memref<2x4xf32>) {
+ // expected-error@+1 {{failed to legalize operation 'memref.copy'}}
+ memref.copy %arg0, %arg0 : memref<2x4xf32> to memref<2x4xf32>
+ return
+}
+
+// -----
+
+func.func @alloca_with_dynamic_shape() {
+ %0 = index.constant 1
+ // expected-error@+1 {{failed to legalize operation 'memref.alloca'}}
+ %1 = memref.alloca(%0) : memref<4x?xf32>
+ return
+}
+
+// -----
+
+func.func @alloca_with_alignment() {
+ // expected-error@+1 {{failed to legalize operation 'memref.alloca'}}
+ %0 = memref.alloca() {alignment = 64 : i64}: memref<4xf32>
+ return
+}
+
+// -----
+
+func.func @non_identity_layout() {
+ // expected-error@+1 {{failed to legalize operation 'memref.alloca'}}
+ %0 = memref.alloca() : memref<4x3xf32, affine_map<(d0, d1) -> (d1, d0)>>
+ return
+}
+
+// -----
+
+func.func @zero_rank() {
+ // expected-error@+1 {{failed to legalize operation 'memref.alloca'}}
+ %0 = memref.alloca() : memref<f32>
+ return
+}
diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir
new file mode 100644
index 000000000000..9793b2d6d783
--- /dev/null
+++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir
@@ -0,0 +1,28 @@
+// RUN: mlir-opt -convert-memref-to-emitc %s -split-input-file | FileCheck %s
+
+// CHECK-LABEL: memref_store
+// CHECK-SAME: %[[v:.*]]: f32, %[[i:.*]]: index, %[[j:.*]]: index
+func.func @memref_store(%v : f32, %i: index, %j: index) {
+ // CHECK: %[[ALLOCA:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.array<4x8xf32>
+ %0 = memref.alloca() : memref<4x8xf32>
+
+ // CHECK: %[[SUBSCRIPT:.*]] = emitc.subscript %[[ALLOCA]][%[[i]], %[[j]]] : <4x8xf32>
+ // CHECK: emitc.assign %[[v]] : f32 to %[[SUBSCRIPT:.*]] : f32
+ memref.store %v, %0[%i, %j] : memref<4x8xf32>
+ return
+}
+// -----
+
+// CHECK-LABEL: memref_load
+// CHECK-SAME: %[[i:.*]]: index, %[[j:.*]]: index
+func.func @memref_load(%i: index, %j: index) -> f32 {
+ // CHECK: %[[ALLOCA:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> !emitc.array<4x8xf32>
+ %0 = memref.alloca() : memref<4x8xf32>
+
+ // CHECK: %[[LOAD:.*]] = emitc.subscript %[[ALLOCA]][%[[i]], %[[j]]] : <4x8xf32>
+ // CHECK: %[[VAR:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32
+ // CHECK: emitc.assign %[[LOAD]] : f32 to %[[VAR]] : f32
+ %1 = memref.load %0[%i, %j] : memref<4x8xf32>
+ // CHECK: return %[[VAR]] : f32
+ return %1 : f32
+}
diff --git a/mlir/test/Conversion/MemRefToSPIRV/bitwidth-emulation.mlir b/mlir/test/Conversion/MemRefToSPIRV/bitwidth-emulation.mlir
index 470c8531e2e0..52ed14e8cce2 100644
--- a/mlir/test/Conversion/MemRefToSPIRV/bitwidth-emulation.mlir
+++ b/mlir/test/Conversion/MemRefToSPIRV/bitwidth-emulation.mlir
@@ -12,16 +12,10 @@ module attributes {
// CHECK-LABEL: @load_i1
func.func @load_i1(%arg0: memref<i1, #spirv.storage_class<StorageBuffer>>) -> i1 {
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ZERO]]]
// CHECK: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]]
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
- // CHECK: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[VALUE]], %[[MASK]] : i32
+ // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[LOAD]], %[[MASK]] : i32
// CHECK: %[[T2:.+]] = spirv.Constant 24 : i32
// CHECK: %[[T3:.+]] = spirv.ShiftLeftLogical %[[T1]], %[[T2]] : i32, i32
// CHECK: %[[T4:.+]] = spirv.ShiftRightArithmetic %[[T3]], %[[T2]] : i32, i32
@@ -37,32 +31,20 @@ func.func @load_i1(%arg0: memref<i1, #spirv.storage_class<StorageBuffer>>) -> i1
// INDEX64-LABEL: @load_i8
func.func @load_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>) -> i8 {
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ZERO]]]
// CHECK: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]]
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
- // CHECK: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[VALUE]], %[[MASK]] : i32
+ // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[LOAD]], %[[MASK]] : i32
// CHECK: %[[T2:.+]] = spirv.Constant 24 : i32
// CHECK: %[[T3:.+]] = spirv.ShiftLeftLogical %[[T1]], %[[T2]] : i32, i32
// CHECK: %[[SR:.+]] = spirv.ShiftRightArithmetic %[[T3]], %[[T2]] : i32, i32
// CHECK: builtin.unrealized_conversion_cast %[[SR]]
// INDEX64: %[[ZERO:.+]] = spirv.Constant 0 : i64
- // INDEX64: %[[FOUR:.+]] = spirv.Constant 4 : i64
- // INDEX64: %[[QUOTIENT:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]] : {{.+}}, i64, i64
+ // INDEX64: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ZERO]]] : {{.+}}, i64, i64
// INDEX64: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]] : i32
- // INDEX64: %[[EIGHT:.+]] = spirv.Constant 8 : i64
- // INDEX64: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i64
- // INDEX64: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i64
// INDEX64: %[[MASK:.+]] = spirv.Constant 255 : i32
- // INDEX64: %[[T1:.+]] = spirv.BitwiseAnd %[[VALUE]], %[[MASK]] : i32
+ // INDEX64: %[[T1:.+]] = spirv.BitwiseAnd %[[LOAD]], %[[MASK]] : i32
// INDEX64: %[[T2:.+]] = spirv.Constant 24 : i32
// INDEX64: %[[T3:.+]] = spirv.ShiftLeftLogical %[[T1]], %[[T2]] : i32, i32
// INDEX64: %[[SR:.+]] = spirv.ShiftRightArithmetic %[[T3]], %[[T2]] : i32, i32
@@ -76,15 +58,12 @@ func.func @load_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>) -> i8
func.func @load_i16(%arg0: memref<10xi16, #spirv.storage_class<StorageBuffer>>, %index : index) -> i16 {
// CHECK: %[[ARG1_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG1]] : index to i32
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[UPDATE:.+]] = spirv.IMul %[[ONE]], %[[ARG1_CAST]] : i32
- // CHECK: %[[FLAT_IDX:.+]] = spirv.IAdd %[[ZERO]], %[[UPDATE]] : i32
// CHECK: %[[TWO:.+]] = spirv.Constant 2 : i32
- // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[FLAT_IDX]], %[[TWO]] : i32
+ // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[ARG1_CAST]], %[[TWO]] : i32
// CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
// CHECK: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]]
// CHECK: %[[SIXTEEN:.+]] = spirv.Constant 16 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[FLAT_IDX]], %[[TWO]] : i32
+ // CHECK: %[[IDX:.+]] = spirv.UMod %[[ARG1_CAST]], %[[TWO]] : i32
// CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[SIXTEEN]] : i32
// CHECK: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Constant 65535 : i32
@@ -110,20 +89,12 @@ func.func @load_f32(%arg0: memref<f32, #spirv.storage_class<StorageBuffer>>) {
func.func @store_i1(%arg0: memref<i1, #spirv.storage_class<StorageBuffer>>, %value: i1) {
// CHECK: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
- // CHECK: %[[MASK1:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i32
- // CHECK: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
+ // CHECK: %[[MASK:.+]] = spirv.Constant -256 : i32
// CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
// CHECK: %[[CASTED_ARG1:.+]] = spirv.Select %[[ARG1]], %[[ONE]], %[[ZERO]] : i1, i32
- // CHECK: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CASTED_ARG1]], %[[OFFSET]] : i32, i32
- // CHECK: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ZERO]]]
// CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
- // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
+ // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[CASTED_ARG1]]
memref.store %value, %arg0[] : memref<i1, #spirv.storage_class<StorageBuffer>>
return
}
@@ -136,36 +107,22 @@ func.func @store_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>, %val
// CHECK-DAG: %[[ARG1_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG1]] : i8 to i32
// CHECK-DAG: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
// CHECK: %[[MASK1:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i32
- // CHECK: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
+ // CHECK: %[[MASK2:.+]] = spirv.Constant -256 : i32
// CHECK: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[ARG1_CAST]], %[[MASK1]] : i32
- // CHECK: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[OFFSET]] : i32, i32
- // CHECK: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]]
- // CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
- // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ZERO]]]
+ // CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK2]]
+ // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[CLAMPED_VAL]]
// INDEX64-DAG: %[[ARG1_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG1]] : i8 to i32
// INDEX64-DAG: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// INDEX64: %[[ZERO:.+]] = spirv.Constant 0 : i64
- // INDEX64: %[[FOUR:.+]] = spirv.Constant 4 : i64
- // INDEX64: %[[EIGHT:.+]] = spirv.Constant 8 : i64
- // INDEX64: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i64
// INDEX64: %[[MASK1:.+]] = spirv.Constant 255 : i32
- // INDEX64: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i64
- // INDEX64: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
+ // INDEX64: %[[MASK2:.+]] = spirv.Constant -256 : i32
// INDEX64: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[ARG1_CAST]], %[[MASK1]] : i32
- // INDEX64: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[OFFSET]] : i32, i64
- // INDEX64: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]] : {{.+}}, i64, i64
- // INDEX64: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
- // INDEX64: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
+ // INDEX64: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ZERO]]] : {{.+}}, i64, i64
+ // INDEX64: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK2]]
+ // INDEX64: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[CLAMPED_VAL]]
memref.store %value, %arg0[] : memref<i8, #spirv.storage_class<StorageBuffer>>
return
}
@@ -177,19 +134,16 @@ func.func @store_i16(%arg0: memref<10xi16, #spirv.storage_class<StorageBuffer>>,
// CHECK-DAG: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK-DAG: %[[ARG1_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG1]] : index to i32
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[UPDATE:.+]] = spirv.IMul %[[ONE]], %[[ARG1_CAST]] : i32
- // CHECK: %[[FLAT_IDX:.+]] = spirv.IAdd %[[ZERO]], %[[UPDATE]] : i32
// CHECK: %[[TWO:.+]] = spirv.Constant 2 : i32
// CHECK: %[[SIXTEEN:.+]] = spirv.Constant 16 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[FLAT_IDX]], %[[TWO]] : i32
+ // CHECK: %[[IDX:.+]] = spirv.UMod %[[ARG1_CAST]], %[[TWO]] : i32
// CHECK: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[SIXTEEN]] : i32
// CHECK: %[[MASK1:.+]] = spirv.Constant 65535 : i32
// CHECK: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
// CHECK: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[ARG2_CAST]], %[[MASK1]] : i32
// CHECK: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[OFFSET]] : i32, i32
- // CHECK: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[FLAT_IDX]], %[[TWO]] : i32
+ // CHECK: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ARG1_CAST]], %[[TWO]] : i32
// CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]]
// CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
// CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
@@ -222,15 +176,12 @@ module attributes {
func.func @load_i4(%arg0: memref<?xi4, #spirv.storage_class<StorageBuffer>>, %i: index) -> i4 {
// CHECK: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %{{.+}} : index to i32
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[INDEX]] : i32
- // CHECK: %[[OFFSET:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
// CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[OFFSET]], %[[EIGHT]] : i32
+ // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[INDEX]], %[[EIGHT]] : i32
// CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
// CHECK: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]] : i32
// CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[OFFSET]], %[[EIGHT]] : i32
+ // CHECK: %[[IDX:.+]] = spirv.UMod %[[INDEX]], %[[EIGHT]] : i32
// CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[FOUR]] : i32
// CHECK: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Constant 15 : i32
@@ -248,19 +199,16 @@ func.func @store_i4(%arg0: memref<?xi4, #spirv.storage_class<StorageBuffer>>, %v
// CHECK: %[[VAL:.+]] = builtin.unrealized_conversion_cast %{{.+}} : i4 to i32
// CHECK: %[[INDEX:.+]] = builtin.unrealized_conversion_cast %{{.+}} : index to i32
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[INDEX]] : i32
- // CHECK: %[[OFFSET:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
// CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant [[OFFSET]] : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[OFFSET]], %[[EIGHT]] : i32
+ // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
+ // CHECK: %[[IDX:.+]] = spirv.UMod %[[INDEX]], %[[EIGHT]] : i32
// CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[FOUR]] : i32
// CHECK: %[[MASK1:.+]] = spirv.Constant 15 : i32
// CHECK: %[[SL:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[BITS]] : i32, i32
// CHECK: %[[MASK2:.+]] = spirv.Not %[[SL]] : i32
// CHECK: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[VAL]], %[[MASK1]] : i32
// CHECK: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[BITS]] : i32, i32
- // CHECK: %[[ACCESS_INDEX:.+]] = spirv.SDiv %[[OFFSET]], %[[EIGHT]] : i32
+ // CHECK: %[[ACCESS_INDEX:.+]] = spirv.SDiv %[[INDEX]], %[[EIGHT]] : i32
// CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ACCESS_INDEX]]]
// CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK2]]
// CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
@@ -283,16 +231,10 @@ module attributes {
// INDEX64-LABEL: @load_i8
func.func @load_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>) -> i8 {
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[QUOTIENT:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ZERO]]]
// CHECK: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]]
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
- // CHECK: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i32
// CHECK: %[[MASK:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[VALUE]], %[[MASK]] : i32
+ // CHECK: %[[T1:.+]] = spirv.BitwiseAnd %[[LOAD]], %[[MASK]] : i32
// CHECK: %[[T2:.+]] = spirv.Constant 24 : i32
// CHECK: %[[T3:.+]] = spirv.ShiftLeftLogical %[[T1]], %[[T2]] : i32, i32
// CHECK: %[[SR:.+]] = spirv.ShiftRightArithmetic %[[T3]], %[[T2]] : i32, i32
@@ -300,16 +242,10 @@ func.func @load_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>) -> i8
// CHECK: return %[[CAST]] : i8
// INDEX64: %[[ZERO:.+]] = spirv.Constant 0 : i64
- // INDEX64: %[[FOUR:.+]] = spirv.Constant 4 : i64
- // INDEX64: %[[QUOTIENT:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[QUOTIENT]]] : {{.+}}, i64, i64
+ // INDEX64: %[[PTR:.+]] = spirv.AccessChain %{{.+}}[%[[ZERO]], %[[ZERO]]] : {{.+}}, i64, i64
// INDEX64: %[[LOAD:.+]] = spirv.Load "StorageBuffer" %[[PTR]] : i32
- // INDEX64: %[[EIGHT:.+]] = spirv.Constant 8 : i64
- // INDEX64: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[BITS:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i64
- // INDEX64: %[[VALUE:.+]] = spirv.ShiftRightArithmetic %[[LOAD]], %[[BITS]] : i32, i64
// INDEX64: %[[MASK:.+]] = spirv.Constant 255 : i32
- // INDEX64: %[[T1:.+]] = spirv.BitwiseAnd %[[VALUE]], %[[MASK]] : i32
+ // INDEX64: %[[T1:.+]] = spirv.BitwiseAnd %[[LOAD]], %[[MASK]] : i32
// INDEX64: %[[T2:.+]] = spirv.Constant 24 : i32
// INDEX64: %[[T3:.+]] = spirv.ShiftLeftLogical %[[T1]], %[[T2]] : i32, i32
// INDEX64: %[[SR:.+]] = spirv.ShiftRightArithmetic %[[T3]], %[[T2]] : i32, i32
@@ -326,37 +262,19 @@ func.func @load_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>) -> i8
func.func @store_i8(%arg0: memref<i8, #spirv.storage_class<StorageBuffer>>, %value: i8) {
// CHECK-DAG: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[FOUR:.+]] = spirv.Constant 4 : i32
- // CHECK: %[[EIGHT:.+]] = spirv.Constant 8 : i32
- // CHECK: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i32
- // CHECK: %[[MASK1:.+]] = spirv.Constant 255 : i32
- // CHECK: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i32
- // CHECK: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
+ // CHECK: %[[MASK1:.+]] = spirv.Constant -256 : i32
// CHECK: %[[ARG1_CAST:.+]] = spirv.UConvert %[[ARG1]] : i8 to i32
- // CHECK: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[ARG1_CAST]], %[[MASK1]] : i32
- // CHECK: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[OFFSET]] : i32, i32
- // CHECK: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i32
- // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]]
- // CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
- // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
+ // CHECK: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ZERO]]]
+ // CHECK: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK1]]
+ // CHECK: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[ARG1_CAST]]
// INDEX64-DAG: %[[ARG0_CAST:.+]] = builtin.unrealized_conversion_cast %[[ARG0]]
// INDEX64: %[[ZERO:.+]] = spirv.Constant 0 : i64
- // INDEX64: %[[FOUR:.+]] = spirv.Constant 4 : i64
- // INDEX64: %[[EIGHT:.+]] = spirv.Constant 8 : i64
- // INDEX64: %[[IDX:.+]] = spirv.UMod %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[OFFSET:.+]] = spirv.IMul %[[IDX]], %[[EIGHT]] : i64
- // INDEX64: %[[MASK1:.+]] = spirv.Constant 255 : i32
- // INDEX64: %[[TMP1:.+]] = spirv.ShiftLeftLogical %[[MASK1]], %[[OFFSET]] : i32, i64
- // INDEX64: %[[MASK:.+]] = spirv.Not %[[TMP1]] : i32
+ // INDEX64: %[[MASK1:.+]] = spirv.Constant -256 : i32
// INDEX64: %[[ARG1_CAST:.+]] = spirv.UConvert %[[ARG1]] : i8 to i32
- // INDEX64: %[[CLAMPED_VAL:.+]] = spirv.BitwiseAnd %[[ARG1_CAST]], %[[MASK1]] : i32
- // INDEX64: %[[STORE_VAL:.+]] = spirv.ShiftLeftLogical %[[CLAMPED_VAL]], %[[OFFSET]] : i32, i64
- // INDEX64: %[[ACCESS_IDX:.+]] = spirv.SDiv %[[ZERO]], %[[FOUR]] : i64
- // INDEX64: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ACCESS_IDX]]] : {{.+}}, i64, i64
- // INDEX64: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK]]
- // INDEX64: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[STORE_VAL]]
+ // INDEX64: %[[PTR:.+]] = spirv.AccessChain %[[ARG0_CAST]][%[[ZERO]], %[[ZERO]]] : {{.+}}, i64, i64
+ // INDEX64: spirv.AtomicAnd <Device> <AcquireRelease> %[[PTR]], %[[MASK1]]
+ // INDEX64: spirv.AtomicOr <Device> <AcquireRelease> %[[PTR]], %[[ARG1_CAST]]
memref.store %value, %arg0[] : memref<i8, #spirv.storage_class<StorageBuffer>>
return
}
diff --git a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
index feb6d4e92401..10c03a270005 100644
--- a/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
+++ b/mlir/test/Conversion/MemRefToSPIRV/memref-to-spirv.mlir
@@ -70,11 +70,8 @@ func.func @load_store_unknown_dim(%i: index, %source: memref<?xi32, #spirv.stora
func.func @load_i1(%src: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i : index) -> i1 {
// CHECK-DAG: %[[SRC_CAST:.+]] = builtin.unrealized_conversion_cast %[[SRC]] : memref<4xi1, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<4 x i8, stride=1> [0])>, StorageBuffer>
// CHECK-DAG: %[[IDX_CAST:.+]] = builtin.unrealized_conversion_cast %[[IDX]]
- // CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[IDX_CAST]] : i32
- // CHECK: %[[ADD:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
- // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[SRC_CAST]][%[[ZERO]], %[[ADD]]]
+ // CHECK: %[[ZERO:.*]] = spirv.Constant 0 : i32
+ // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[SRC_CAST]][%[[ZERO]], %[[IDX_CAST]]]
// CHECK: %[[VAL:.+]] = spirv.Load "StorageBuffer" %[[ADDR]] : i8
// CHECK: %[[ONE_I8:.+]] = spirv.Constant 1 : i8
// CHECK: %[[BOOL:.+]] = spirv.IEqual %[[VAL]], %[[ONE_I8]] : i8
@@ -90,15 +87,10 @@ func.func @store_i1(%dst: memref<4xi1, #spirv.storage_class<StorageBuffer>>, %i:
%true = arith.constant true
// CHECK-DAG: %[[DST_CAST:.+]] = builtin.unrealized_conversion_cast %[[DST]] : memref<4xi1, #spirv.storage_class<StorageBuffer>> to !spirv.ptr<!spirv.struct<(!spirv.array<4 x i8, stride=1> [0])>, StorageBuffer>
// CHECK-DAG: %[[IDX_CAST:.+]] = builtin.unrealized_conversion_cast %[[IDX]]
- // CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[IDX_CAST]] : i32
- // CHECK: %[[ADD:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
- // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[DST_CAST]][%[[ZERO]], %[[ADD]]]
- // CHECK: %[[ZERO_I8:.+]] = spirv.Constant 0 : i8
+ // CHECK: %[[ZERO:.*]] = spirv.Constant 0 : i32
+ // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[DST_CAST]][%[[ZERO]], %[[IDX_CAST]]]
// CHECK: %[[ONE_I8:.+]] = spirv.Constant 1 : i8
- // CHECK: %[[RES:.+]] = spirv.Select %{{.+}}, %[[ONE_I8]], %[[ZERO_I8]] : i1, i8
- // CHECK: spirv.Store "StorageBuffer" %[[ADDR]], %[[RES]] : i8
+ // CHECK: spirv.Store "StorageBuffer" %[[ADDR]], %[[ONE_I8]] : i8
memref.store %true, %dst[%i]: memref<4xi1, #spirv.storage_class<StorageBuffer>>
return
}
@@ -234,11 +226,7 @@ func.func @load_store_unknown_dim(%i: index, %source: memref<?xi32, #spirv.stora
func.func @load_i1(%src: memref<4xi1, #spirv.storage_class<CrossWorkgroup>>, %i : index) -> i1 {
// CHECK-DAG: %[[SRC_CAST:.+]] = builtin.unrealized_conversion_cast %[[SRC]] : memref<4xi1, #spirv.storage_class<CrossWorkgroup>> to !spirv.ptr<!spirv.array<4 x i8>, CrossWorkgroup>
// CHECK-DAG: %[[IDX_CAST:.+]] = builtin.unrealized_conversion_cast %[[IDX]]
- // CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[IDX_CAST]] : i32
- // CHECK: %[[ADD:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
- // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[SRC_CAST]][%[[ADD]]]
+ // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[SRC_CAST]][%[[IDX_CAST]]]
// CHECK: %[[VAL:.+]] = spirv.Load "CrossWorkgroup" %[[ADDR]] : i8
// CHECK: %[[ONE_I8:.+]] = spirv.Constant 1 : i8
// CHECK: %[[BOOL:.+]] = spirv.IEqual %[[VAL]], %[[ONE_I8]] : i8
@@ -254,15 +242,9 @@ func.func @store_i1(%dst: memref<4xi1, #spirv.storage_class<CrossWorkgroup>>, %i
%true = arith.constant true
// CHECK-DAG: %[[DST_CAST:.+]] = builtin.unrealized_conversion_cast %[[DST]] : memref<4xi1, #spirv.storage_class<CrossWorkgroup>> to !spirv.ptr<!spirv.array<4 x i8>, CrossWorkgroup>
// CHECK-DAG: %[[IDX_CAST:.+]] = builtin.unrealized_conversion_cast %[[IDX]]
- // CHECK: %[[ZERO:.+]] = spirv.Constant 0 : i32
- // CHECK: %[[ONE:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL:.+]] = spirv.IMul %[[ONE]], %[[IDX_CAST]] : i32
- // CHECK: %[[ADD:.+]] = spirv.IAdd %[[ZERO]], %[[MUL]] : i32
- // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[DST_CAST]][%[[ADD]]]
- // CHECK: %[[ZERO_I8:.+]] = spirv.Constant 0 : i8
+ // CHECK: %[[ADDR:.+]] = spirv.AccessChain %[[DST_CAST]][%[[IDX_CAST]]]
// CHECK: %[[ONE_I8:.+]] = spirv.Constant 1 : i8
- // CHECK: %[[RES:.+]] = spirv.Select %{{.+}}, %[[ONE_I8]], %[[ZERO_I8]] : i1, i8
- // CHECK: spirv.Store "CrossWorkgroup" %[[ADDR]], %[[RES]] : i8
+ // CHECK: spirv.Store "CrossWorkgroup" %[[ADDR]], %[[ONE_I8]] : i8
memref.store %true, %dst[%i]: memref<4xi1, #spirv.storage_class<CrossWorkgroup>>
return
}
diff --git a/mlir/test/Conversion/SCFToSPIRV/for.mlir b/mlir/test/Conversion/SCFToSPIRV/for.mlir
index 02558463b866..81661ec7a3a0 100644
--- a/mlir/test/Conversion/SCFToSPIRV/for.mlir
+++ b/mlir/test/Conversion/SCFToSPIRV/for.mlir
@@ -19,17 +19,9 @@ func.func @loop_kernel(%arg2 : memref<10xf32, #spirv.storage_class<StorageBuffer
// CHECK: spirv.BranchConditional %[[CMP]], ^[[BODY:.*]], ^[[MERGE:.*]]
// CHECK: ^[[BODY]]:
// CHECK: %[[ZERO1:.*]] = spirv.Constant 0 : i32
- // CHECK: %[[OFFSET1:.*]] = spirv.Constant 0 : i32
- // CHECK: %[[STRIDE1:.*]] = spirv.Constant 1 : i32
- // CHECK: %[[UPDATE1:.*]] = spirv.IMul %[[STRIDE1]], %[[INDVAR]] : i32
- // CHECK: %[[INDEX1:.*]] = spirv.IAdd %[[OFFSET1]], %[[UPDATE1]] : i32
- // CHECK: spirv.AccessChain {{%.*}}{{\[}}%[[ZERO1]], %[[INDEX1]]{{\]}}
+ // CHECK: spirv.AccessChain {{%.*}}{{\[}}%[[ZERO1]], %[[INDVAR]]{{\]}}
// CHECK: %[[ZERO2:.*]] = spirv.Constant 0 : i32
- // CHECK: %[[OFFSET2:.*]] = spirv.Constant 0 : i32
- // CHECK: %[[STRIDE2:.*]] = spirv.Constant 1 : i32
- // CHECK: %[[UPDATE2:.*]] = spirv.IMul %[[STRIDE2]], %[[INDVAR]] : i32
- // CHECK: %[[INDEX2:.*]] = spirv.IAdd %[[OFFSET2]], %[[UPDATE2]] : i32
- // CHECK: spirv.AccessChain {{%.*}}[%[[ZERO2]], %[[INDEX2]]]
+ // CHECK: spirv.AccessChain {{%.*}}[%[[ZERO2]], %[[INDVAR]]]
// CHECK: %[[INCREMENT:.*]] = spirv.IAdd %[[INDVAR]], %[[STEP]] : i32
// CHECK: spirv.Branch ^[[HEADER]](%[[INCREMENT]] : i32)
// CHECK: ^[[MERGE]]
diff --git a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir
index 19de613bf5b0..32d0fbea65b1 100644
--- a/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir
+++ b/mlir/test/Conversion/TensorToSPIRV/tensor-ops-to-spirv.mlir
@@ -14,14 +14,12 @@ func.func @tensor_extract_constant(%a : index, %b: index, %c: index) -> i32 {
// CHECK: spirv.Store "Function" %[[VAR]], %[[CST]] : !spirv.array<12 x i32>
// CHECK: %[[C0:.+]] = spirv.Constant 0 : i32
// CHECK: %[[C6:.+]] = spirv.Constant 6 : i32
- // CHECK: %[[MUL0:.+]] = spirv.IMul %[[C6]], %[[A]] : i32
- // CHECK: %[[ADD0:.+]] = spirv.IAdd %[[C0]], %[[MUL0]] : i32
+ // CHECK: %[[MUL0:.+]] = spirv.IMul %[[A]], %[[C6]] : i32
// CHECK: %[[C3:.+]] = spirv.Constant 3 : i32
- // CHECK: %[[MUL1:.+]] = spirv.IMul %[[C3]], %[[B]] : i32
- // CHECK: %[[ADD1:.+]] = spirv.IAdd %[[ADD0]], %[[MUL1]] : i32
+ // CHECK: %[[MUL1:.+]] = spirv.IMul %[[B]], %[[C3]] : i32
+ // CHECK: %[[ADD1:.+]] = spirv.IAdd %[[MUL1]], %[[MUL0]] : i32
// CHECK: %[[C1:.+]] = spirv.Constant 1 : i32
- // CHECK: %[[MUL2:.+]] = spirv.IMul %[[C1]], %[[C]] : i32
- // CHECK: %[[ADD2:.+]] = spirv.IAdd %[[ADD1]], %[[MUL2]] : i32
+ // CHECK: %[[ADD2:.+]] = spirv.IAdd %[[C]], %[[ADD1]] : i32
// CHECK: %[[AC:.+]] = spirv.AccessChain %[[VAR]][%[[ADD2]]]
// CHECK: %[[VAL:.+]] = spirv.Load "Function" %[[AC]] : i32
%extract = tensor.extract %cst[%a, %b, %c] : tensor<2x2x3xi32>
diff --git a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
index c9984091d5ac..cddc4ee38535 100644
--- a/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
+++ b/mlir/test/Conversion/VectorToSPIRV/vector-to-spirv.mlir
@@ -720,9 +720,7 @@ module attributes {
// CHECK: %[[CST1:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST2:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST3:.+]] = spirv.Constant 1 : i32
-// CHECK: %[[S2:.+]] = spirv.IMul %[[CST3]], %[[S1]] : i32
-// CHECK: %[[S3:.+]] = spirv.IAdd %[[CST2]], %[[S2]] : i32
-// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S3]]] : !spirv.ptr<!spirv.struct<(!spirv.array<4 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
+// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S1]]] : !spirv.ptr<!spirv.struct<(!spirv.array<4 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
// CHECK: %[[S5:.+]] = spirv.Bitcast %[[S4]] : !spirv.ptr<f32, StorageBuffer> to !spirv.ptr<vector<4xf32>, StorageBuffer>
// CHECK: %[[R0:.+]] = spirv.Load "StorageBuffer" %[[S5]] : vector<4xf32>
// CHECK: return %[[R0]] : vector<4xf32>
@@ -743,11 +741,9 @@ func.func @vector_load(%arg0 : memref<4xf32, #spirv.storage_class<StorageBuffer>
// CHECK: %[[CST0_1:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST0_2:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST4:.+]] = spirv.Constant 4 : i32
-// CHECK: %[[S3:.+]] = spirv.IMul %[[CST4]], %[[S1]] : i32
-// CHECK: %[[S4:.+]] = spirv.IAdd %[[CST0_2]], %[[S3]] : i32
+// CHECK: %[[S3:.+]] = spirv.IMul %[[S1]], %[[CST4]] : i32
// CHECK: %[[CST1:.+]] = spirv.Constant 1 : i32
-// CHECK: %[[S5:.+]] = spirv.IMul %[[CST1]], %[[S2]] : i32
-// CHECK: %[[S6:.+]] = spirv.IAdd %[[S4]], %[[S5]] : i32
+// CHECK: %[[S6:.+]] = spirv.IAdd %[[S2]], %[[S3]] : i32
// CHECK: %[[S7:.+]] = spirv.AccessChain %[[S0]][%[[CST0_1]], %[[S6]]] : !spirv.ptr<!spirv.struct<(!spirv.array<16 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
// CHECK: %[[S8:.+]] = spirv.Bitcast %[[S7]] : !spirv.ptr<f32, StorageBuffer> to !spirv.ptr<vector<4xf32>, StorageBuffer>
// CHECK: %[[R0:.+]] = spirv.Load "StorageBuffer" %[[S8]] : vector<4xf32>
@@ -768,9 +764,7 @@ func.func @vector_load_2d(%arg0 : memref<4x4xf32, #spirv.storage_class<StorageBu
// CHECK: %[[CST1:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST2:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST3:.+]] = spirv.Constant 1 : i32
-// CHECK: %[[S2:.+]] = spirv.IMul %[[CST3]], %[[S1]] : i32
-// CHECK: %[[S3:.+]] = spirv.IAdd %[[CST2]], %[[S2]] : i32
-// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S3]]] : !spirv.ptr<!spirv.struct<(!spirv.array<4 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
+// CHECK: %[[S4:.+]] = spirv.AccessChain %[[S0]][%[[CST1]], %[[S1]]] : !spirv.ptr<!spirv.struct<(!spirv.array<4 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
// CHECK: %[[S5:.+]] = spirv.Bitcast %[[S4]] : !spirv.ptr<f32, StorageBuffer> to !spirv.ptr<vector<4xf32>, StorageBuffer>
// CHECK: spirv.Store "StorageBuffer" %[[S5]], %[[ARG1]] : vector<4xf32>
func.func @vector_store(%arg0 : memref<4xf32, #spirv.storage_class<StorageBuffer>>, %arg1 : vector<4xf32>) {
@@ -790,11 +784,9 @@ func.func @vector_store(%arg0 : memref<4xf32, #spirv.storage_class<StorageBuffer
// CHECK: %[[CST0_1:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST0_2:.+]] = spirv.Constant 0 : i32
// CHECK: %[[CST4:.+]] = spirv.Constant 4 : i32
-// CHECK: %[[S3:.+]] = spirv.IMul %[[CST4]], %[[S1]] : i32
-// CHECK: %[[S4:.+]] = spirv.IAdd %[[CST0_2]], %[[S3]] : i32
+// CHECK: %[[S3:.+]] = spirv.IMul %[[S1]], %[[CST4]] : i32
// CHECK: %[[CST1:.+]] = spirv.Constant 1 : i32
-// CHECK: %[[S5:.+]] = spirv.IMul %[[CST1]], %[[S2]] : i32
-// CHECK: %[[S6:.+]] = spirv.IAdd %[[S4]], %[[S5]] : i32
+// CHECK: %[[S6:.+]] = spirv.IAdd %[[S2]], %[[S3]] : i32
// CHECK: %[[S7:.+]] = spirv.AccessChain %[[S0]][%[[CST0_1]], %[[S6]]] : !spirv.ptr<!spirv.struct<(!spirv.array<16 x f32, stride=4> [0])>, StorageBuffer>, i32, i32
// CHECK: %[[S8:.+]] = spirv.Bitcast %[[S7]] : !spirv.ptr<f32, StorageBuffer> to !spirv.ptr<vector<4xf32>, StorageBuffer>
// CHECK: spirv.Store "StorageBuffer" %[[S8]], %[[ARG1]] : vector<4xf32>
diff --git a/mlir/test/Dialect/Arith/one-shot-bufferize.mlir b/mlir/test/Dialect/Arith/one-shot-bufferize.mlir
index 174bf2fc8e4b..f6bdca7f4d9e 100644
--- a/mlir/test/Dialect/Arith/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Arith/one-shot-bufferize.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
index e4375950d336..8f0170b17381 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="allow-unknown-ops" -canonicalize -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// CHECK-LABEL: func @buffer_not_deallocated(
// CHECK-SAME: %[[t:.*]]: tensor<?xf32>
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-bottom-up-from-terminators.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-bottom-up-from-terminators.mlir
new file mode 100644
index 000000000000..1b75edc4c157
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis-bottom-up-from-terminators.mlir
@@ -0,0 +1,36 @@
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=bottom-up-from-terminators" -split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @simple_test(
+func.func @simple_test(%lb: index, %ub: index, %step: index, %f1: f32, %f2: f32) -> (tensor<5xf32>, tensor<5xf32>) {
+ %c0 = arith.constant 0 : index
+ %p = arith.constant 0.0 : f32
+
+ // Make sure that ops that feed into region terminators bufferize in-place
+ // (if possible).
+ // Note: This test case fails to bufferize with a "top-down" or "bottom-up"
+ // heuristic.
+
+ %0 = tensor.empty() : tensor<5xf32>
+ %1 = scf.for %iv = %lb to %ub step %step iter_args(%t = %0) -> (tensor<5xf32>) {
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+ %2 = linalg.fill ins(%f1 : f32) outs(%t : tensor<5xf32>) -> tensor<5xf32>
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+ %3 = linalg.fill ins(%f2 : f32) outs(%t : tensor<5xf32>) -> tensor<5xf32>
+ %4 = vector.transfer_read %2[%c0], %p : tensor<5xf32>, vector<5xf32>
+ vector.print %4 : vector<5xf32>
+ scf.yield %3 : tensor<5xf32>
+ }
+
+ %5 = tensor.empty() : tensor<5xf32>
+ %6 = scf.for %iv = %lb to %ub step %step iter_args(%t = %0) -> (tensor<5xf32>) {
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+ %7 = linalg.fill ins(%f1 : f32) outs(%t : tensor<5xf32>) -> tensor<5xf32>
+ // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+ %8 = linalg.fill ins(%f2 : f32) outs(%t : tensor<5xf32>) -> tensor<5xf32>
+ %9 = vector.transfer_read %8[%c0], %p : tensor<5xf32>, vector<5xf32>
+ vector.print %9 : vector<5xf32>
+ scf.yield %7 : tensor<5xf32>
+ }
+
+ return %1, %6 : tensor<5xf32>, tensor<5xf32>
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
index 2c5f2083f589..9380c81ce235 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
@@ -4,9 +4,9 @@
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-unknown-ops unknown-type-conversion=identity-layout-map" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=tensor,bufferization allow-unknown-ops" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="dialect-filter=scf,bufferization allow-unknown-ops" -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-SCF
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
index 611b67e198c0..0ed3a9f077ce 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="allow-unknown-ops" -verify-diagnostics -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23" -verify-diagnostics -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59" -verify-diagnostics -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91" -verify-diagnostics -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -verify-diagnostics -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -verify-diagnostics -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -verify-diagnostics -split-input-file -o /dev/null
// Run with top-down analysis.
// RUN: mlir-opt %s -one-shot-bufferize="allow-unknown-ops analysis-heuristic=top-down" -verify-diagnostics -split-input-file | FileCheck %s --check-prefix=CHECK-TOP-DOWN-ANALYSIS
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
index 9319ac61d928..c58b153d438c 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-allow-return-allocs.mlir
@@ -2,9 +2,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 " -split-input-file | FileCheck %s --check-prefix=NO-DROP
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" -split-input-file -o /dev/null
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
index 6e7b113aa35c..42d9cc00d3ff 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
@@ -1,9 +1,14 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+
+// Try different heuristics. Not checking the result, just make sure that we do
+// not crash.
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=bottom-up-from-terminators" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=top-down" -split-input-file -o /dev/null
// TODO: Extract op-specific test cases and move them to their respective
// dialects.
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
index 39f4835b28ff..429c9e4dea9e 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
@@ -2,9 +2,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1" -canonicalize -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
diff --git a/mlir/test/Dialect/EmitC/transforms.mlir b/mlir/test/Dialect/EmitC/transforms.mlir
index ad167fa455a1..8ac606a2c8c0 100644
--- a/mlir/test/Dialect/EmitC/transforms.mlir
+++ b/mlir/test/Dialect/EmitC/transforms.mlir
@@ -107,3 +107,20 @@ func.func @expression_with_address_taken(%arg0: i32, %arg1: i32, %arg2: !emitc.p
%d = emitc.cmp lt, %c, %arg2 :(!emitc.ptr<i32>, !emitc.ptr<i32>) -> i1
return %d : i1
}
+
+// CHECK-LABEL: func.func @no_nested_expression(
+// CHECK-SAME: %[[VAL_0:.*]]: i32, %[[VAL_1:.*]]: i32) -> i1 {
+// CHECK: %[[VAL_2:.*]] = emitc.expression : i1 {
+// CHECK: %[[VAL_3:.*]] = emitc.cmp lt, %[[VAL_0]], %[[VAL_1]] : (i32, i32) -> i1
+// CHECK: emitc.yield %[[VAL_3]] : i1
+// CHECK: }
+// CHECK: return %[[VAL_2]] : i1
+// CHECK: }
+
+func.func @no_nested_expression(%arg0: i32, %arg1: i32) -> i1 {
+ %a = emitc.expression : i1 {
+ %b = emitc.cmp lt, %arg0, %arg1 :(i32, i32) -> i1
+ emitc.yield %b : i1
+ }
+ return %a : i1
+}
diff --git a/mlir/test/Dialect/LLVMIR/layout.mlir b/mlir/test/Dialect/LLVMIR/layout.mlir
index 2868e1740f86..a78fb771242e 100644
--- a/mlir/test/Dialect/LLVMIR/layout.mlir
+++ b/mlir/test/Dialect/LLVMIR/layout.mlir
@@ -7,6 +7,7 @@ module {
// CHECK: alloca_memory_space = 0
// CHECK: bitsize = 64
// CHECK: global_memory_space = 0
+ // CHECK: index = 64
// CHECK: preferred = 8
// CHECK: program_memory_space = 0
// CHECK: size = 8
@@ -16,6 +17,7 @@ module {
// CHECK: alloca_memory_space = 0
// CHECK: bitsize = 64
// CHECK: global_memory_space = 0
+ // CHECK: index = 64
// CHECK: preferred = 8
// CHECK: program_memory_space = 0
// CHECK: size = 8
@@ -25,6 +27,7 @@ module {
// CHECK: alloca_memory_space = 0
// CHECK: bitsize = 64
// CHECK: global_memory_space = 0
+ // CHECK: index = 64
// CHECK: preferred = 8
// CHECK: program_memory_space = 0
// CHECK: size = 8
@@ -39,7 +42,7 @@ module {
module attributes { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!llvm.ptr, dense<[32, 32, 64]> : vector<3xi64>>,
#dlti.dl_entry<!llvm.ptr<5>, dense<[64, 64, 64]> : vector<3xi64>>,
- #dlti.dl_entry<!llvm.ptr<4>, dense<[32, 64, 64]> : vector<3xi64>>,
+ #dlti.dl_entry<!llvm.ptr<4>, dense<[32, 64, 64, 24]> : vector<4xi64>>,
#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui64>,
#dlti.dl_entry<"dlti.global_memory_space", 2 : ui64>,
#dlti.dl_entry<"dlti.program_memory_space", 3 : ui64>,
@@ -51,6 +54,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// CHECK: alloca_memory_space = 5
// CHECK: bitsize = 32
// CHECK: global_memory_space = 2
+ // CHECK: index = 32
// CHECK: preferred = 8
// CHECK: program_memory_space = 3
// CHECK: size = 4
@@ -60,6 +64,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// CHECK: alloca_memory_space = 5
// CHECK: bitsize = 32
// CHECK: global_memory_space = 2
+ // CHECK: index = 32
// CHECK: preferred = 8
// CHECK: program_memory_space = 3
// CHECK: size = 4
@@ -69,24 +74,17 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// CHECK: alloca_memory_space = 5
// CHECK: bitsize = 64
// CHECK: global_memory_space = 2
+ // CHECK: index = 64
// CHECK: preferred = 8
// CHECK: program_memory_space = 3
// CHECK: size = 8
// CHECK: stack_alignment = 128
"test.data_layout_query"() : () -> !llvm.ptr<5>
- // CHECK: alignment = 4
- // CHECK: alloca_memory_space = 5
- // CHECK: bitsize = 32
- // CHECK: global_memory_space = 2
- // CHECK: preferred = 8
- // CHECK: program_memory_space = 3
- // CHECK: size = 4
- // CHECK: stack_alignment = 128
- "test.data_layout_query"() : () -> !llvm.ptr<3>
// CHECK: alignment = 8
// CHECK: alloca_memory_space = 5
// CHECK: bitsize = 32
// CHECK: global_memory_space = 2
+ // CHECK: index = 24
// CHECK: preferred = 8
// CHECK: program_memory_space = 3
// CHECK: size = 4
@@ -134,6 +132,7 @@ module {
// simple case
// CHECK: alignment = 4
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 4
"test.data_layout_query"() : () -> !llvm.struct<(i32)>
@@ -141,6 +140,7 @@ module {
// padding inbetween
// CHECK: alignment = 8
// CHECK: bitsize = 128
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 16
"test.data_layout_query"() : () -> !llvm.struct<(i32, f64)>
@@ -148,6 +148,7 @@ module {
// padding at end of struct
// CHECK: alignment = 8
// CHECK: bitsize = 128
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 16
"test.data_layout_query"() : () -> !llvm.struct<(f64, i32)>
@@ -155,6 +156,7 @@ module {
// packed
// CHECK: alignment = 1
// CHECK: bitsize = 96
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 12
"test.data_layout_query"() : () -> !llvm.struct<packed (f64, i32)>
@@ -162,6 +164,7 @@ module {
// empty
// CHECK: alignment = 1
// CHECK: bitsize = 0
+ // CHECK: index = 0
// CHECK: preferred = 1
// CHECK: size = 0
"test.data_layout_query"() : () -> !llvm.struct<()>
@@ -179,6 +182,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// Strict alignment is applied
// CHECK: alignment = 4
// CHECK: bitsize = 16
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 2
"test.data_layout_query"() : () -> !llvm.struct<(i16)>
@@ -186,6 +190,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// No impact on structs that have stricter requirements
// CHECK: alignment = 8
// CHECK: bitsize = 128
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 16
"test.data_layout_query"() : () -> !llvm.struct<(i32, f64)>
@@ -193,6 +198,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// Only the preferred alignment of structs is affected
// CHECK: alignment = 1
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 4
"test.data_layout_query"() : () -> !llvm.struct<packed (i16, i16)>
@@ -200,6 +206,7 @@ module attributes { dlti.dl_spec = #dlti.dl_spec<
// empty
// CHECK: alignment = 4
// CHECK: bitsize = 0
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 0
"test.data_layout_query"() : () -> !llvm.struct<()>
@@ -265,6 +272,7 @@ module {
// simple case
// CHECK: alignment = 4
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 8
"test.data_layout_query"() : () -> !llvm.array<2 x i32>
@@ -272,6 +280,7 @@ module {
// size 0
// CHECK: alignment = 8
// CHECK: bitsize = 0
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 0
"test.data_layout_query"() : () -> !llvm.array<0 x f64>
@@ -279,6 +288,7 @@ module {
// alignment info matches element type
// CHECK: alignment = 4
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 8
"test.data_layout_query"() : () -> !llvm.array<1 x i64>
diff --git a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
index 858c133dd536..5a27fe76b134 100644
--- a/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
+++ b/mlir/test/Dialect/Linalg/flatten-elementwise.mlir
@@ -67,6 +67,27 @@ module attributes {transform.with_named_sequence} {
// -----
+// CHECK-LABEL: func.func @map_already_flat(
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref<32xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref<32xf32>
+// CHECK-NEXT: linalg.map { arith.addf } ins(%[[ARG0]], %[[ARG1]] : memref<32xf32>, memref<32xf32>) outs(%[[ARG2]] : memref<32xf32>)
+func.func @map_already_flat(%arg0: memref<32xf32>, %arg1: memref<32xf32>, %arg2: memref<32xf32>) {
+ linalg.map {arith.addf} ins(%arg0, %arg1: memref<32xf32>, memref<32xf32>) outs(%arg2: memref<32xf32>)
+ return
+}
+
+module attributes {transform.with_named_sequence} {
+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!transform.any_op) -> !transform.any_op
+ %flattened = transform.structured.flatten_elementwise %0
+ : (!transform.any_op) -> !transform.any_op
+ transform.yield
+ }
+}
+
+// -----
+
// CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (d0)>
// CHECK-LABEL: func.func @generic
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref<32x7xf32>
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
index c69701b65e20..9616a3e32a06 100644
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -canonicalize -buffer-loop-hoisting -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT-MAP
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
index 7d23498f32e1..4d82021e86f5 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91" -split-input-file -o /dev/null
// CHECK-LABEL: func @scf_for_yield_only
func.func @scf_for_yield_only(
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index 24da8d84b18e..485fdd9b0e59 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries" -cse -canonicalize -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops analysis-heuristic=fuzzer test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops analysis-heuristic=fuzzer test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops analysis-heuristic=fuzzer test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs-from-loops unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null
diff --git a/mlir/test/Dialect/Tensor/decompose-concat.mlir b/mlir/test/Dialect/Tensor/decompose-concat.mlir
index 159347b4f7aa..c0f23b8eddbd 100644
--- a/mlir/test/Dialect/Tensor/decompose-concat.mlir
+++ b/mlir/test/Dialect/Tensor/decompose-concat.mlir
@@ -73,7 +73,8 @@ func.func @decompose_dynamic_into_static_concat_dim(%arg0 : tensor<1x?x?xf32>,
// CHECK: return %[[CONCAT]] : tensor<1x?x128xf32>
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.tensor.decompose_concat
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
index 057e105f3b57..15f841f2128e 100644
--- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir
+++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir
@@ -1,7 +1,8 @@
// RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.tensor.fold_tensor_empty
} : !transform.op<"func.func">
@@ -67,7 +68,8 @@ func.func @rank_reducing_empty_tensor_extract(%sz : index, %idx : index) -> tens
// -----
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.tensor.fold_tensor_empty
{fold_single_use_only = true}
diff --git a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops-into-vector-transfers.mlir b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops-into-vector-transfers.mlir
index 505abc8f3533..6213db3956f9 100644
--- a/mlir/test/Dialect/Tensor/fold-tensor-subset-ops-into-vector-transfers.mlir
+++ b/mlir/test/Dialect/Tensor/fold-tensor-subset-ops-into-vector-transfers.mlir
@@ -1,7 +1,8 @@
// RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.tensor.fold_tensor_subset_ops_into_vector_transfers
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index 38c3bb8af810..e2169fe1404c 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -1,9 +1,9 @@
// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries" -drop-equivalent-buffer-results -split-input-file | FileCheck %s
// Run fuzzer with different seeds.
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
-// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -one-shot-bufferize="test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
// Test bufferization using memref types that have no layout map.
// RUN: mlir-opt %s -one-shot-bufferize="unknown-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null
diff --git a/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir b/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir
index d68a6bd25286..1a1cf9e407d8 100644
--- a/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir
+++ b/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir
@@ -1,7 +1,8 @@
// RUN: mlir-opt -split-input-file -transform-interpreter %s | FileCheck %s
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.tensor.rewrite_as_constant
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Tosa/constant-op-fold.mlir b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
index 27ca3ae3c21b..de752f31fcba 100644
--- a/mlir/test/Dialect/Tosa/constant-op-fold.mlir
+++ b/mlir/test/Dialect/Tosa/constant-op-fold.mlir
@@ -112,6 +112,23 @@ func.func @transpose_nofold_quantized_types() -> tensor<1x1x2x2x!quant.uniform<i
return %0: tensor<1x1x2x2x!quant.uniform<i8<-127:127>:f32:3, {1.000000e-01,1.000000e-01}>>
}
+// CHECK-LABEL: @transpose_nofold_dense_resource
+func.func @transpose_nofold_dense_resource() -> tensor<2x2xf32> {
+ %0 = "tosa.const"() <{value = dense_resource<resource> : tensor<2x2xf32>}> : () -> tensor<2x2xf32>
+ %1 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+
+ // CHECK: tosa.transpose
+ %2 = tosa.transpose %0, %1 : (tensor<2x2xf32>, tensor<2xi32>) -> tensor<2x2xf32>
+ return %2 : tensor<2x2xf32>
+}
+{-#
+ dialect_resources: {
+ builtin: {
+ resource: "0x08000000010000000000000002000000000000000300000000000000"
+ }
+ }
+#-}
+
// -----
// CHECK-LABEL: @fold_add_zero_rhs_f32
diff --git a/mlir/test/Dialect/Vector/test-scalable-bounds.mlir b/mlir/test/Dialect/Vector/test-scalable-bounds.mlir
new file mode 100644
index 000000000000..245a6f5c13ac
--- /dev/null
+++ b/mlir/test/Dialect/Vector/test-scalable-bounds.mlir
@@ -0,0 +1,161 @@
+// RUN: mlir-opt %s -test-affine-reify-value-bounds -cse -verify-diagnostics \
+// RUN: -verify-diagnostics -split-input-file | FileCheck %s
+
+#map_dim_i = affine_map<(d0)[s0] -> (-d0 + 32400, s0)>
+#map_dim_j = affine_map<(d0)[s0] -> (-d0 + 16, s0)>
+
+// Here the upper bound for min_i is 4 x vscale, as we know 4 x vscale is
+// always less than 32400. The bound for min_j is 16, as 16 is always less
+// 4 x vscale_max (vscale_max is the UB for vscale).
+
+// CHECK: #[[$SCALABLE_BOUND_MAP_0:.*]] = affine_map<()[s0] -> (s0 * 4)>
+
+// CHECK-LABEL: @fixed_size_loop_nest
+// CHECK-DAG: %[[VSCALE:.*]] = vector.vscale
+// CHECK-DAG: %[[UB_i:.*]] = affine.apply #[[$SCALABLE_BOUND_MAP_0]]()[%[[VSCALE]]]
+// CHECK-DAG: %[[UB_j:.*]] = arith.constant 16 : index
+// CHECK: "test.some_use"(%[[UB_i]], %[[UB_j]]) : (index, index) -> ()
+func.func @fixed_size_loop_nest() {
+ %c16 = arith.constant 16 : index
+ %c32400 = arith.constant 32400 : index
+ %c4 = arith.constant 4 : index
+ %c0 = arith.constant 0 : index
+ %vscale = vector.vscale
+ %c4_vscale = arith.muli %vscale, %c4 : index
+ scf.for %i = %c0 to %c32400 step %c4_vscale {
+ %min_i = affine.min #map_dim_i(%i)[%c4_vscale]
+ scf.for %j = %c0 to %c16 step %c4_vscale {
+ %min_j = affine.min #map_dim_j(%j)[%c4_vscale]
+ %bound_i = "test.reify_scalable_bound"(%min_i) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ %bound_j = "test.reify_scalable_bound"(%min_j) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound_i, %bound_j) : (index, index) -> ()
+ }
+ }
+ return
+}
+
+// -----
+
+#map_dynamic_dim = affine_map<(d0)[s0, s1] -> (-d0 + s1, s0)>
+
+// Here upper bounds for both min_i and min_j are both (conservatively)
+// 4 x vscale, as we know that is always the largest value they could take. As
+// if `dim < 4 x vscale` then 4 x vscale is an overestimate, and if
+// `dim > 4 x vscale` then the min will be clamped to 4 x vscale.
+
+// CHECK: #[[$SCALABLE_BOUND_MAP_1:.*]] = affine_map<()[s0] -> (s0 * 4)>
+
+// CHECK-LABEL: @dynamic_size_loop_nest
+// CHECK: %[[VSCALE:.*]] = vector.vscale
+// CHECK: %[[UB_ij:.*]] = affine.apply #[[$SCALABLE_BOUND_MAP_1]]()[%[[VSCALE]]]
+// CHECK: "test.some_use"(%[[UB_ij]], %[[UB_ij]]) : (index, index) -> ()
+func.func @dynamic_size_loop_nest(%dim0: index, %dim1: index) {
+ %c4 = arith.constant 4 : index
+ %c0 = arith.constant 0 : index
+ %vscale = vector.vscale
+ %c4_vscale = arith.muli %vscale, %c4 : index
+ scf.for %i = %c0 to %dim0 step %c4_vscale {
+ %min_i = affine.min #map_dynamic_dim(%i)[%c4_vscale, %dim0]
+ scf.for %j = %c0 to %dim1 step %c4_vscale {
+ %min_j = affine.min #map_dynamic_dim(%j)[%c4_vscale, %dim1]
+ %bound_i = "test.reify_scalable_bound"(%min_i) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ %bound_j = "test.reify_scalable_bound"(%min_j) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound_i, %bound_j) : (index, index) -> ()
+ }
+ }
+ return
+}
+
+// -----
+
+// Here the bound is just a value + a constant.
+
+// CHECK: #[[$SCALABLE_BOUND_MAP_2:.*]] = affine_map<()[s0] -> (s0 + 8)>
+
+// CHECK-LABEL: @add_to_vscale
+// CHECK: %[[VSCALE:.*]] = vector.vscale
+// CHECK: %[[SCALABLE_BOUND:.*]] = affine.apply #[[$SCALABLE_BOUND_MAP_2]]()[%[[VSCALE]]]
+// CHECK: "test.some_use"(%[[SCALABLE_BOUND]]) : (index) -> ()
+func.func @add_to_vscale() {
+ %vscale = vector.vscale
+ %c8 = arith.constant 8 : index
+ %vscale_plus_c8 = arith.addi %vscale, %c8 : index
+ %bound = "test.reify_scalable_bound"(%vscale_plus_c8) {type = "EQ", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound) : (index) -> ()
+ return
+}
+
+// -----
+
+// Here we know vscale is always 2 so we get a constant bound.
+
+// CHECK-LABEL: @vscale_fixed_size
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: "test.some_use"(%[[C2]]) : (index) -> ()
+func.func @vscale_fixed_size() {
+ %vscale = vector.vscale
+ %bound = "test.reify_scalable_bound"(%vscale) {type = "EQ", vscale_min = 2, vscale_max = 2} : (index) -> index
+ "test.some_use"(%bound) : (index) -> ()
+ return
+}
+
+// -----
+
+// Here we don't know the upper bound (%a is underspecified)
+
+func.func @unknown_bound(%a: index) {
+ %vscale = vector.vscale
+ %vscale_plus_a = arith.muli %vscale, %a : index
+ // expected-error @below{{could not reify bound}}
+ %bound = "test.reify_scalable_bound"(%vscale_plus_a) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound) : (index) -> ()
+ return
+}
+
+// -----
+
+// Here we have two vscale values (that have not been CSE'd), but they should
+// still be treated as equivalent.
+
+// CHECK: #[[$SCALABLE_BOUND_MAP_3:.*]] = affine_map<()[s0] -> (s0 * 6)>
+
+// CHECK-LABEL: @duplicate_vscale_values
+// CHECK: %[[VSCALE:.*]] = vector.vscale
+// CHECK: %[[SCALABLE_BOUND:.*]] = affine.apply #[[$SCALABLE_BOUND_MAP_3]]()[%[[VSCALE]]]
+// CHECK: "test.some_use"(%[[SCALABLE_BOUND]]) : (index) -> ()
+func.func @duplicate_vscale_values() {
+ %c4 = arith.constant 4 : index
+ %vscale_0 = vector.vscale
+
+ %c2 = arith.constant 2 : index
+ %vscale_1 = vector.vscale
+
+ %c4_vscale = arith.muli %vscale_0, %c4 : index
+ %c2_vscale = arith.muli %vscale_1, %c2 : index
+ %add = arith.addi %c2_vscale, %c4_vscale : index
+
+ %bound = "test.reify_scalable_bound"(%add) {type = "EQ", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound) : (index) -> ()
+ return
+}
+
+// -----
+
+// Test some non-scalable code to ensure that works too:
+
+#map_dim_i = affine_map<(d0)[s0] -> (-d0 + 1024, s0)>
+
+// CHECK-LABEL: @non_scalable_code
+// CHECK: %[[C4:.*]] = arith.constant 4 : index
+// CHECK: "test.some_use"(%[[C4]]) : (index) -> ()
+func.func @non_scalable_code() {
+ %c1024 = arith.constant 1024 : index
+ %c4 = arith.constant 4 : index
+ %c0 = arith.constant 0 : index
+ scf.for %i = %c0 to %c1024 step %c4 {
+ %min_i = affine.min #map_dim_i(%i)[%c4]
+ %bound_i = "test.reify_scalable_bound"(%min_i) {type = "UB", vscale_min = 1, vscale_max = 16} : (index) -> index
+ "test.some_use"(%bound_i) : (index) -> ()
+ }
+ return
+}
diff --git a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
index 412e95bede3a..0b3636e3b196 100644
--- a/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
+++ b/mlir/test/Dialect/Vector/vector-contract-to-outerproduct-matvec-transforms.mlir
@@ -657,7 +657,8 @@ func.func @masked_extract_contract2_scalable_reduction_dim(%arg0: vector<[2]x[3]
// TD sequence
// ============================================================================
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-materialize-mask.mlir b/mlir/test/Dialect/Vector/vector-materialize-mask.mlir
index c47d91bb6ed9..a3fd6339492c 100644
--- a/mlir/test/Dialect/Vector/vector-materialize-mask.mlir
+++ b/mlir/test/Dialect/Vector/vector-materialize-mask.mlir
@@ -8,7 +8,8 @@ func.func @select_single_i1_vector(%cond : i1) -> vector<1xi1> {
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.materialize_masks
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
index 6e06ba1bb14b..22808aa7d6ac 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-lowering.mlir
@@ -282,7 +282,8 @@ func.func private @scalable_dims(%A : vector<8x[4]x2xf32>, %B: vector<8x[4]xf32>
// CHECK: return %[[VAL_163]] : vector<8x[4]xf32>
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir
index 308baa97af9a..33adb5545647 100644
--- a/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-multi-reduction-outer-lowering.mlir
@@ -189,7 +189,8 @@ func.func @vector_multi_reduction_to_scalar(%arg0: vector<2x3xf32>, %acc: f32) -
// CHECK: return %{{.+}}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerparallel"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
index d65708068862..e9d12b044e2c 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-drop-unit-dims-patterns.mlir
@@ -239,7 +239,8 @@ func.func @masked_transfer_read_dynamic_rank_reducing_scalable_unit_dim(
// CHECK: vector.transfer_read {{.*}} vector<[16]x[1]xi8>
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.rank_reducing_subview_patterns
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir
index bcb8e1a10c84..483147c6f6a4 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split-copy-transform.mlir
@@ -63,7 +63,7 @@ func.func @split_vector_transfer_read_strided_2d(
%c0 = arith.constant 0 : index
%f0 = arith.constant 0.0 : f32
-
+
// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[c4:.*]] = arith.constant 4 : index
// CHECK-DAG: %[[c7:.*]] = arith.constant 7 : index
@@ -107,7 +107,8 @@ func.func @split_vector_transfer_read_strided_2d(
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "linalg-copy"
} : !transform.op<"func.func">
@@ -170,7 +171,8 @@ func.func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf3
// CHECK: }
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "linalg-copy"
} : !transform.op<"func.func">
@@ -240,7 +242,8 @@ func.func @split_vector_transfer_write_strided_2d(
// CHECK: }
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "linalg-copy"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
index 644de885bbaa..a9c7bf8e8b32 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
@@ -133,7 +133,8 @@ func.func @split_vector_transfer_read_mem_space(%A: memref<?x8xf32, 3>, %i: inde
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "vector-transfer"
} : !transform.op<"func.func">
@@ -193,7 +194,8 @@ func.func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf3
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "vector-transfer"
} : !transform.op<"func.func">
@@ -257,7 +259,8 @@ func.func @split_vector_transfer_write_strided_2d(
// CHECK: }
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "vector-transfer"
} : !transform.op<"func.func">
@@ -292,7 +295,8 @@ func.func @split_vector_transfer_write_mem_space(%V: vector<4x8xf32>, %A: memref
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "vector-transfer"
} : !transform.op<"func.func">
@@ -337,7 +341,8 @@ func.func @transfer_read_within_scf_for(%A : memref<?x?xf32>, %lb : index, %ub :
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.split_transfer_full_partial split_transfer_strategy = "vector-transfer"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
index 7aaaff70e524..2f2bdcaab5b3 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-to-vector-load-store.mlir
@@ -239,7 +239,8 @@ func.func @transfer_broadcasting_complex(%mem : memref<10x20x30x8x8xf32>, %i : i
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transfer max_transfer_rank = 99
transform.apply_patterns.vector.transfer_permutation_patterns
@@ -363,7 +364,8 @@ func.func @transfer_write_broadcast_unit_dim(
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transfer max_transfer_rank = 99
transform.apply_patterns.vector.transfer_permutation_patterns
@@ -391,7 +393,8 @@ func.func @transfer_2D_masked(%mem : memref<?x?xf32>, %mask : vector<2x4xi1>) ->
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transfer max_transfer_rank = 2
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
index 97b698edeb05..628a8ce50959 100644
--- a/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
+++ b/mlir/test/Dialect/Vector/vector-transpose-lowering.mlir
@@ -86,7 +86,8 @@ func.func @transpose23_scalable(%arg0: vector<2x[3]xf32>) -> vector<[3]x2xf32> {
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "eltwise"
} : !transform.op<"func.func">
@@ -111,7 +112,8 @@ func.func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_1d"
} : !transform.op<"func.func">
@@ -132,7 +134,8 @@ func.func @transpose(%arg0: vector<2x4xf32>) -> vector<4x2xf32> {
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "flat_transpose"
} : !transform.op<"func.func">
@@ -621,7 +624,8 @@ func.func @transpose210_1x8x8xf32(%arg0: vector<1x8x8xf32>) -> vector<8x8x1xf32>
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose avx2_lowering_strategy = true
} : !transform.op<"func.func">
@@ -701,7 +705,8 @@ func.func @transpose_shuffle16x16xf32(%arg0: vector<16x16xf32>) -> vector<16x16x
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_16x16"
} : !transform.op<"func.func">
@@ -782,7 +787,8 @@ func.func @transpose021_shuffle16x16xf32(%arg0: vector<1x16x16xf32>) -> vector<1
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_16x16"
} : !transform.op<"func.func">
@@ -842,7 +848,8 @@ func.func @transpose10_nx4xnx1xf32(%arg0: vector<4x[1]xf32>) -> vector<[1]x4xf32
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose
} : !transform.op<"func.func">
@@ -863,7 +870,8 @@ func.func @transpose_nx8x2xf32(%arg0: vector<[8]x2xf32>) -> vector<2x[8]xf32> {
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_1d"
} : !transform.op<"func.func">
diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
new file mode 100644
index 000000000000..039346adbb85
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s | FileCheck %s
+// Verify the printed output can be parsed.
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// Verify the generic form can be parsed.
+// RUN: mlir-opt -mlir-print-op-generic %s | mlir-opt | FileCheck %s
+
+// CHECK-LABEL: gpu.module @test {
+gpu.module @test {
+// CHECK: gpu.func @test_create_nd_tdesc_vc_1(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_1(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_2(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) {
+gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) {
+ //CHECK: %[[C:.*]] = arith.constant 1 : index
+ %c1 = arith.constant 1 : index
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) {
+gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) {
+ // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2 : i64>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr<array_length = 2>>
+ gpu.return
+}
+
+// CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: xegpu.prefetch_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<8x16xf16>
+ xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<8x16xf16>
+ gpu.return
+}
+
+// CHECK: func @test_load_nd_vc(%[[arg0:.*]]: memref<8x16xf16>) {
+gpu.func @test_load_nd_vc(%src: memref<8x16xf16>) {
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16>
+ // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, vnni_axis = 0 : i64}> : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ %2 = xegpu.load_nd %1 <{vnni_axis = 0, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+ : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16>
+ gpu.return
+}
+
+// CHECK: func @test_store_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) {
+gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) {
+ // CHECK: %[[C:.*]] = arith.constant dense<1.000000e+00> : vector<24x32xf16>
+ %1 = arith.constant dense<1.0>: vector<24x32xf16>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16>
+ // CHECK: xegpu.store_nd %[[C]], %[[R0]] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}> : vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16>
+ gpu.return
+}
+
+} \ No newline at end of file
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
index 34d450c2403f..7ecccad212cd 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack.mlir
@@ -285,8 +285,7 @@ module {
%has_runtime = sparse_tensor.has_runtime_library
scf.if %has_runtime {
// sparse_tensor.assemble copies buffers when running with the runtime
- // library. Deallocations are needed not needed when running in codgen
- // mode.
+ // library. Deallocations are not needed when running in codegen mode.
bufferization.dealloc_tensor %s4 : tensor<10x10xf64, #SortedCOO>
bufferization.dealloc_tensor %s5 : tensor<10x10xf64, #SortedCOOI32>
bufferization.dealloc_tensor %csr : tensor<2x2xf64, #CSR>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
index fe8836266a47..20ae7e86285c 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pack_d.mlir
@@ -146,8 +146,7 @@ module {
%has_runtime = sparse_tensor.has_runtime_library
scf.if %has_runtime {
// sparse_tensor.assemble copies buffers when running with the runtime
- // library. Deallocations are needed not needed when running in codgen
- // mode.
+ // library. Deallocations are not needed when running in codegen mode.
bufferization.dealloc_tensor %s0 : tensor<4x3x2xf32, #CCC>
bufferization.dealloc_tensor %s1 : tensor<4x3x2xf32, #BatchedCSR>
bufferization.dealloc_tensor %s2 : tensor<4x3x2xf32, #CSRDense>
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
index 396417bd9b44..f7f0a7267cd0 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/test-shuffle16x16.mlir
@@ -30,7 +30,8 @@ func.func @entry() {
}
module attributes {transform.with_named_sequence} {
- transform.named_sequence @__transform_main(%func_op: !transform.op<"func.func"> {transform.readonly}) {
+ transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
+ %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func">
transform.apply_patterns to %func_op {
transform.apply_patterns.vector.lower_transpose lowering_strategy = "shuffle_16x16"
} : !transform.op<"func.func">
diff --git a/mlir/test/Interfaces/DataLayoutInterfaces/module.mlir b/mlir/test/Interfaces/DataLayoutInterfaces/module.mlir
index 096e7ceb3cbc..97286ce75806 100644
--- a/mlir/test/Interfaces/DataLayoutInterfaces/module.mlir
+++ b/mlir/test/Interfaces/DataLayoutInterfaces/module.mlir
@@ -2,11 +2,13 @@
module attributes { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 12]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 32]>>} {
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 32]>,
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["index", 7]>>} {
// CHECK-LABEL: @module_level_layout
func.func @module_level_layout() {
// CHECK: alignment = 32
// CHECK: bitsize = 12
+ // CHECK: index = 7
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
diff --git a/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir b/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
index 9f9240ac6f8c..d3bc91339d16 100644
--- a/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
+++ b/mlir/test/Interfaces/DataLayoutInterfaces/query.mlir
@@ -4,24 +4,34 @@
func.func @no_layout_builtin() {
// CHECK: alignment = 4
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 4
"test.data_layout_query"() : () -> i32
// CHECK: alignment = 8
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 8
// CHECK: size = 8
"test.data_layout_query"() : () -> f64
// CHECK: alignment = 4
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 4
// CHECK: size = 8
"test.data_layout_query"() : () -> complex<f32>
// CHECK: alignment = 1
// CHECK: bitsize = 14
+ // CHECK: index = 0
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> complex<i6>
+ // CHECK: alignment = 4
+ // CHECK: bitsize = 64
+ // CHECK: index = 64
+ // CHECK: preferred = 8
+ // CHECK: size = 8
+ "test.data_layout_query"() : () -> index
return
}
@@ -30,6 +40,7 @@ func.func @no_layout_builtin() {
func.func @no_layout_custom() {
// CHECK: alignment = 1
// CHECK: bitsize = 1
+ // CHECK: index = 1
// CHECK: preferred = 1
// CHECK: size = 1
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -41,6 +52,7 @@ func.func @layout_op_no_layout() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 1
// CHECK: bitsize = 1
+ // CHECK: index = 1
// CHECK: preferred = 1
// CHECK: size = 1
"test.data_layout_query"() : () -> !test.test_type_with_layout<1000>
@@ -54,13 +66,15 @@ func.func @layout_op() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 30
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 10]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>,
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["index", 30]>
>} : () -> ()
return
}
@@ -72,13 +86,15 @@ func.func @nested_inner_only() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 30
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 10]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>,
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["index", 30]>
>} : () -> ()
"test.maybe_terminator"() : () -> ()
}) : () -> ()
@@ -92,6 +108,7 @@ func.func @nested_outer_only() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 30
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -100,7 +117,8 @@ func.func @nested_outer_only() {
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 10]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>,
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["index", 30]>
>} : () -> ()
return
}
@@ -112,6 +130,7 @@ func.func @nested_middle_only() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 30
// CHECK: preferred = 1
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -120,7 +139,8 @@ func.func @nested_middle_only() {
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 10]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>,
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["index", 30]>
>} : () -> ()
"test.maybe_terminator"() : () -> ()
}) : () -> ()
@@ -134,6 +154,7 @@ func.func @nested_combine_with_missing() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 21
// CHECK: preferred = 30
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -146,13 +167,15 @@ func.func @nested_combine_with_missing() {
>} : () -> ()
// CHECK: alignment = 1
// CHECK: bitsize = 42
+ // CHECK: index = 21
// CHECK: preferred = 30
// CHECK: size = 6
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 42]>,
- #dlti.dl_entry<!test.test_type_with_layout<30>, ["preferred", 30]>
+ #dlti.dl_entry<!test.test_type_with_layout<30>, ["preferred", 30]>,
+ #dlti.dl_entry<!test.test_type_with_layout<40>, ["index", 21]>
>}: () -> ()
return
}
@@ -164,6 +187,7 @@ func.func @nested_combine_all() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 20
// CHECK: bitsize = 3
+ // CHECK: index = 40
// CHECK: preferred = 30
// CHECK: size = 1
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -174,16 +198,19 @@ func.func @nested_combine_all() {
>} : () -> ()
// CHECK: alignment = 20
// CHECK: bitsize = 10
+ // CHECK: index = 40
// CHECK: preferred = 30
// CHECK: size = 2
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
"test.maybe_terminator"() : () -> ()
}) { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<!test.test_type_with_layout<10>, ["size", 10]>,
- #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>
+ #dlti.dl_entry<!test.test_type_with_layout<20>, ["alignment", 20]>,
+ #dlti.dl_entry<!test.test_type_with_layout<40>, ["index", 40]>
>} : () -> ()
// CHECK: alignment = 1
// CHECK: bitsize = 42
+ // CHECK: index = 1
// CHECK: preferred = 30
// CHECK: size = 6
"test.data_layout_query"() : () -> !test.test_type_with_layout<10>
@@ -200,18 +227,22 @@ func.func @integers() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 8
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 8
"test.data_layout_query"() : () -> i32
// CHECK: alignment = 16
// CHECK: bitsize = 56
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> i56
// CHECK: alignment = 16
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> i64
// CHECK: alignment = 16
// CHECK: bitsize = 128
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> i128
"test.maybe_terminator"() : () -> ()
@@ -222,18 +253,22 @@ func.func @integers() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 8
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> i32
// CHECK: alignment = 16
// CHECK: bitsize = 56
+ // CHECK: index = 0
// CHECK: preferred = 32
"test.data_layout_query"() : () -> i56
// CHECK: alignment = 16
// CHECK: bitsize = 64
+ // CHECK: index = 0
// CHECK: preferred = 32
"test.data_layout_query"() : () -> i64
// CHECK: alignment = 16
// CHECK: bitsize = 128
+ // CHECK: index = 0
// CHECK: preferred = 32
"test.data_layout_query"() : () -> i128
"test.maybe_terminator"() : () -> ()
@@ -248,10 +283,12 @@ func.func @floats() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 8
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 8
"test.data_layout_query"() : () -> f32
// CHECK: alignment = 16
// CHECK: bitsize = 80
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> f80
"test.maybe_terminator"() : () -> ()
@@ -262,10 +299,12 @@ func.func @floats() {
"test.op_with_data_layout"() ({
// CHECK: alignment = 8
// CHECK: bitsize = 32
+ // CHECK: index = 0
// CHECK: preferred = 16
"test.data_layout_query"() : () -> f32
// CHECK: alignment = 16
// CHECK: bitsize = 80
+ // CHECK: index = 0
// CHECK: preferred = 32
"test.data_layout_query"() : () -> f80
"test.maybe_terminator"() : () -> ()
diff --git a/mlir/test/Interfaces/DataLayoutInterfaces/types.mlir b/mlir/test/Interfaces/DataLayoutInterfaces/types.mlir
index 55bb1d2eac91..82ae02cf92ad 100644
--- a/mlir/test/Interfaces/DataLayoutInterfaces/types.mlir
+++ b/mlir/test/Interfaces/DataLayoutInterfaces/types.mlir
@@ -40,6 +40,7 @@ module @index attributes { dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<index, 32>>} {
func.func @query() {
// CHECK: bitsize = 32
+ // CHECK: index = 32
"test.data_layout_query"() : () -> index
return
}
diff --git a/mlir/test/Target/LLVMIR/data-layout.mlir b/mlir/test/Target/LLVMIR/data-layout.mlir
index e61972a0dd97..881d6727e2a1 100644
--- a/mlir/test/Target/LLVMIR/data-layout.mlir
+++ b/mlir/test/Target/LLVMIR/data-layout.mlir
@@ -6,7 +6,7 @@
// CHECK: S128-
// CHECK: i64:64:128
// CHECK: f80:128:256
-// CHECK: p0:32:64:128
+// CHECK: p0:32:64:128:32
// CHECK: p1:32:32:32:16
module attributes {dlti.dl_spec = #dlti.dl_spec<
#dlti.dl_entry<"dlti.endianness", "big">,
diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
index 39671a930f2e..5e160b720db6 100644
--- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
@@ -13,6 +13,7 @@
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/ScalableValueBoundsConstraintSet.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "mlir/Pass/Pass.h"
@@ -75,7 +76,8 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
WalkResult result = funcOp.walk([&](Operation *op) {
// Look for test.reify_bound ops.
if (op->getName().getStringRef() == "test.reify_bound" ||
- op->getName().getStringRef() == "test.reify_constant_bound") {
+ op->getName().getStringRef() == "test.reify_constant_bound" ||
+ op->getName().getStringRef() == "test.reify_scalable_bound") {
if (op->getNumOperands() != 1 || op->getNumResults() != 1 ||
!op->getResultTypes()[0].isIndex()) {
op->emitOpError("invalid op");
@@ -110,6 +112,9 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
bool constant =
op->getName().getStringRef() == "test.reify_constant_bound";
+ bool scalable = !constant && op->getName().getStringRef() ==
+ "test.reify_scalable_bound";
+
// Prepare stop condition. By default, reify in terms of the op's
// operands. No stop condition is used when a constant was requested.
std::function<bool(Value, std::optional<int64_t>)> stopCondition =
@@ -137,6 +142,37 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
if (succeeded(reifiedConst))
reified =
FailureOr<OpFoldResult>(rewriter.getIndexAttr(*reifiedConst));
+ } else if (scalable) {
+ unsigned vscaleMin = 0;
+ unsigned vscaleMax = 0;
+ if (auto attr = "vscale_min"; op->hasAttrOfType<IntegerAttr>(attr)) {
+ vscaleMin = unsigned(op->getAttrOfType<IntegerAttr>(attr).getInt());
+ } else {
+ op->emitOpError("expected `vscale_min` to be provided");
+ return WalkResult::skip();
+ }
+ if (auto attr = "vscale_max"; op->hasAttrOfType<IntegerAttr>(attr)) {
+ vscaleMax = unsigned(op->getAttrOfType<IntegerAttr>(attr).getInt());
+ } else {
+ op->emitOpError("expected `vscale_max` to be provided");
+ return WalkResult::skip();
+ }
+
+ auto loc = op->getLoc();
+ auto reifiedScalable =
+ vector::ScalableValueBoundsConstraintSet::computeScalableBound(
+ value, dim, vscaleMin, vscaleMax, *boundType);
+ if (succeeded(reifiedScalable)) {
+ SmallVector<std::pair<Value, std::optional<int64_t>>, 1>
+ vscaleOperand;
+ if (reifiedScalable->map.getNumInputs() == 1) {
+ // The only possible input to the bound is vscale.
+ vscaleOperand.push_back(std::make_pair(
+ rewriter.create<vector::VectorScaleOp>(loc), std::nullopt));
+ }
+ reified = affine::materializeComputedBound(
+ rewriter, loc, reifiedScalable->map, vscaleOperand);
+ }
} else {
if (dim) {
if (useArithOps) {
diff --git a/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp b/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
index 740562e77830..3da48ffa403e 100644
--- a/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
+++ b/mlir/test/lib/Dialect/DLTI/TestDataLayoutQuery.cpp
@@ -36,19 +36,21 @@ struct TestDataLayoutQuery
return;
const DataLayout &layout = layouts.getAbove(op);
- unsigned size = layout.getTypeSize(op.getType());
- unsigned bitsize = layout.getTypeSizeInBits(op.getType());
- unsigned alignment = layout.getTypeABIAlignment(op.getType());
- unsigned preferred = layout.getTypePreferredAlignment(op.getType());
+ llvm::TypeSize size = layout.getTypeSize(op.getType());
+ llvm::TypeSize bitsize = layout.getTypeSizeInBits(op.getType());
+ uint64_t alignment = layout.getTypeABIAlignment(op.getType());
+ uint64_t preferred = layout.getTypePreferredAlignment(op.getType());
+ uint64_t index = layout.getTypeIndexBitwidth(op.getType()).value_or(0);
Attribute allocaMemorySpace = layout.getAllocaMemorySpace();
Attribute programMemorySpace = layout.getProgramMemorySpace();
Attribute globalMemorySpace = layout.getGlobalMemorySpace();
- unsigned stackAlignment = layout.getStackAlignment();
+ uint64_t stackAlignment = layout.getStackAlignment();
op->setAttrs(
{builder.getNamedAttr("size", builder.getIndexAttr(size)),
builder.getNamedAttr("bitsize", builder.getIndexAttr(bitsize)),
builder.getNamedAttr("alignment", builder.getIndexAttr(alignment)),
builder.getNamedAttr("preferred", builder.getIndexAttr(preferred)),
+ builder.getNamedAttr("index", builder.getIndexAttr(index)),
builder.getNamedAttr("alloca_memory_space",
allocaMemorySpace == Attribute()
? builder.getUI32IntegerAttr(0)
diff --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
index b907f77e9108..ae4f77f5873e 100644
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -16,8 +16,8 @@
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/Transforms/TransformUtils.h"
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 1957845c842f..492642b711e0 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -148,7 +148,8 @@ def TestType : Test_Type<"Test", [
}
def TestTypeWithLayoutType : Test_Type<"TestTypeWithLayout", [
- DeclareTypeInterfaceMethods<DataLayoutTypeInterface, ["areCompatible"]>
+ DeclareTypeInterfaceMethods<DataLayoutTypeInterface, ["getIndexBitwidth",
+ "areCompatible"]>
]> {
let mnemonic = "test_type_with_layout";
let parameters = (ins "unsigned":$key);
diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp
index 2f4c9b689069..7a195eb25a3b 100644
--- a/mlir/test/lib/Dialect/Test/TestTypes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp
@@ -276,6 +276,12 @@ uint64_t TestTypeWithLayoutType::getPreferredAlignment(
return extractKind(params, "preferred");
}
+std::optional<uint64_t>
+TestTypeWithLayoutType::getIndexBitwidth(const DataLayout &dataLayout,
+ DataLayoutEntryListRef params) const {
+ return extractKind(params, "index");
+}
+
bool TestTypeWithLayoutType::areCompatible(
DataLayoutEntryListRef oldLayout, DataLayoutEntryListRef newLayout) const {
unsigned old = extractKind(oldLayout, "alignment");
@@ -297,7 +303,7 @@ TestTypeWithLayoutType::verifyEntries(DataLayoutEntryListRef params,
(void)kind;
assert(kind &&
(kind.getValue() == "size" || kind.getValue() == "alignment" ||
- kind.getValue() == "preferred") &&
+ kind.getValue() == "preferred" || kind.getValue() == "index") &&
"unexpected kind");
assert(llvm::isa<IntegerAttr>(array.getValue().back()));
}
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
index b9424e06bf03..2b39668035bc 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.cpp
@@ -15,8 +15,8 @@
#include "TestTransformStateExtension.h"
#include "mlir/Dialect/PDL/IR/PDL.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/PDLExtension/PDLExtensionOps.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.h b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.h
index 95950e4c5af1..ddc38b993564 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.h
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.h
@@ -17,8 +17,8 @@
#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/PDL/IR/PDLTypes.h"
#include "mlir/Dialect/Transform/IR/MatchInterfaces.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformTypes.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/IR/OpImplementation.h"
namespace mlir {
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td
index c00cc560e83e..75134b25882f 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td
@@ -19,7 +19,7 @@ include "mlir/IR/AttrTypeBase.td"
include "mlir/IR/OpBase.td"
include "mlir/Dialect/Transform/IR/MatchInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/PDL/IR/PDLTypes.td"
def TestTransformTestDialectHandleType
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
index 7d7749958564..e936ac5b852b 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
+++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectInterpreter.cpp
@@ -13,8 +13,8 @@
#include "TestTransformDialectExtension.h"
#include "mlir/Dialect/Transform/DebugExtension/DebugExtensionOps.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
#include "mlir/Dialect/Transform/IR/TransformOps.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Transform/Transforms/TransformInterpreterPassBase.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
diff --git a/mlir/test/lib/Dialect/Transform/TestTransformStateExtension.h b/mlir/test/lib/Dialect/Transform/TestTransformStateExtension.h
index 752b3a78141e..0bfa6bed015c 100644
--- a/mlir/test/lib/Dialect/Transform/TestTransformStateExtension.h
+++ b/mlir/test/lib/Dialect/Transform/TestTransformStateExtension.h
@@ -14,7 +14,7 @@
#ifndef MLIR_TEST_LIB_DIALECT_TRANSFORM_TESTTRANSFORMSTATEEXTENSION_H
#define MLIR_TEST_LIB_DIALECT_TRANSFORM_TESTTRANSFORMSTATEEXTENSION_H
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
using namespace mlir;
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
index b6a0ad84eee0..335db1a61f47 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp
@@ -16,7 +16,7 @@
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/OpImplementation.h"
diff --git a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
index f6e577a5c17a..ef42375e5286 100644
--- a/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
+++ b/mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td
@@ -11,7 +11,7 @@
include "mlir/Dialect/SCF/IR/DeviceMappingInterface.td"
include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
include "mlir/Dialect/Transform/IR/TransformTypes.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/IR/OpBase.td"
diff --git a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
index 794e19710fad..d6b8d7392f32 100644
--- a/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
+++ b/mlir/unittests/Interfaces/DataLayoutInterfacesTest.cpp
@@ -345,6 +345,8 @@ TEST(DataLayout, NullSpec) {
EXPECT_EQ(layout.getTypeABIAlignment(Float16Type::get(&ctx)), 16u);
EXPECT_EQ(layout.getTypePreferredAlignment(IntegerType::get(&ctx, 42)), 128u);
EXPECT_EQ(layout.getTypePreferredAlignment(Float16Type::get(&ctx)), 32u);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(Float16Type::get(&ctx)), std::nullopt);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(IndexType::get(&ctx)), 64u);
EXPECT_EQ(layout.getAllocaMemorySpace(), Attribute());
EXPECT_EQ(layout.getProgramMemorySpace(), Attribute());
@@ -373,6 +375,8 @@ TEST(DataLayout, EmptySpec) {
EXPECT_EQ(layout.getTypeABIAlignment(Float16Type::get(&ctx)), 16u);
EXPECT_EQ(layout.getTypePreferredAlignment(IntegerType::get(&ctx, 42)), 128u);
EXPECT_EQ(layout.getTypePreferredAlignment(Float16Type::get(&ctx)), 32u);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(Float16Type::get(&ctx)), std::nullopt);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(IndexType::get(&ctx)), 64u);
EXPECT_EQ(layout.getAllocaMemorySpace(), Attribute());
EXPECT_EQ(layout.getProgramMemorySpace(), Attribute());
@@ -385,6 +389,7 @@ TEST(DataLayout, SpecWithEntries) {
"dltest.op_with_layout"() { dltest.layout = #dltest.spec<
#dlti.dl_entry<i42, 5>,
#dlti.dl_entry<i16, 6>,
+ #dlti.dl_entry<index, 42>,
#dlti.dl_entry<"dltest.alloca_memory_space", 5 : i32>,
#dlti.dl_entry<"dltest.program_memory_space", 3 : i32>,
#dlti.dl_entry<"dltest.global_memory_space", 2 : i32>,
@@ -408,6 +413,8 @@ TEST(DataLayout, SpecWithEntries) {
EXPECT_EQ(layout.getTypeABIAlignment(Float16Type::get(&ctx)), 8u);
EXPECT_EQ(layout.getTypePreferredAlignment(IntegerType::get(&ctx, 42)), 16u);
EXPECT_EQ(layout.getTypePreferredAlignment(Float16Type::get(&ctx)), 16u);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(Float16Type::get(&ctx)), std::nullopt);
+ EXPECT_EQ(layout.getTypeIndexBitwidth(IndexType::get(&ctx)), 42u);
EXPECT_EQ(layout.getTypeSize(IntegerType::get(&ctx, 32)), 32u);
EXPECT_EQ(layout.getTypeSize(Float32Type::get(&ctx)), 32u);
diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
index b6fc136e8a17..75540f055844 100644
--- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
+++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt
@@ -11,106 +11,9 @@
##===----------------------------------------------------------------------===##
add_subdirectory(common)
-
-# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname,
-# string tmachine_llvm, string tmachine_triple, string elf_machine_id);
-# - build a plugin for an ELF based generic 64-bit target based on libffi.
-# - tmachine: name of the machine processor as used in the cmake build system.
-# - tmachine_name: name of the machine to be printed with the debug messages.
-# - tmachine_libname: machine name to be appended to the plugin library name.
-# - tmachine_llvm: LLVM triple for the processor
-# - tmachine_triple: GNU target triple
-macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_llvm tmachine_triple elf_machine_id)
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$")
- # Define macro to be used as prefix of the runtime messages for this target.
- add_definitions("-DTARGET_NAME=${tmachine_name}")
-
- # Define debug prefix. TODO: This should be automatized in the Debug.h but
- # it requires changing the original plugins.
- add_definitions(-DDEBUG_PREFIX="TARGET ${tmachine_name} RTL")
-
- # Define the macro with the ELF e_machine for this target.
- add_definitions("-DTARGET_ELF_ID=${elf_machine_id}")
-
- # Define target triple
- add_definitions("-DLIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE=${tmachine_llvm}")
-
- add_llvm_library("omptarget.rtl.${tmachine_libname}"
- SHARED
-
- ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp
-
- ADDITIONAL_HEADER_DIRS
- ${LIBOMPTARGET_INCLUDE_DIR}
-
- LINK_LIBS
- PRIVATE
- PluginCommon
- ${OPENMP_PTHREAD_LIB}
-
- NO_INSTALL_RPATH
- BUILDTREE_ONLY
- )
-
- if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
- libomptarget_say("Building ${tmachine_libname} plugin linked with libffi")
- if(FFI_STATIC_LIBRARIES)
- target_link_libraries(
- "omptarget.rtl.${tmachine_libname}" PRIVATE FFI::ffi_static)
- else()
- target_link_libraries(
- "omptarget.rtl.${tmachine_libname}" PRIVATE FFI::ffi)
- endif()
- else()
- libomptarget_say("Building ${tmachine_libname} plugin for dlopened libffi")
- target_sources("omptarget.rtl.${tmachine_libname}" PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/dynamic_ffi/ffi.cpp)
- target_include_directories("omptarget.rtl.${tmachine_libname}" PRIVATE
- ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/dynamic_ffi)
- endif()
-
- if(OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
- target_link_libraries("omptarget.rtl.${tmachine_libname}" PRIVATE OMPT)
- endif()
-
- if(LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
- target_link_libraries("omptarget.rtl.${tmachine_libname}" PRIVATE
- "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
- endif()
-
- # Install plugin under the lib destination folder.
- install(TARGETS "omptarget.rtl.${tmachine_libname}"
- LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
- set_target_properties("omptarget.rtl.${tmachine_libname}" PROPERTIES
- INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
- POSITION_INDEPENDENT_CODE ON
- CXX_VISIBILITY_PRESET protected)
-
- target_include_directories("omptarget.rtl.${tmachine_libname}" PRIVATE
- ${LIBOMPTARGET_INCLUDE_DIR})
-
- if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
- list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.${tmachine_libname}")
- set(LIBOMPTARGET_TESTED_PLUGINS
- "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
- set(LIBOMPTARGET_SYSTEM_TARGETS
- "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}
- ${tmachine_triple}-LTO" PARENT_SCOPE)
- else()
- libomptarget_say("Not generating ${tmachine_name} tests. LibFFI not found.")
- endif()
-else()
- libomptarget_say("Not building ${tmachine_name} NextGen offloading plugin: machine not found in the system.")
-endif()
-endmacro()
-
-add_subdirectory(aarch64)
add_subdirectory(amdgpu)
add_subdirectory(cuda)
-add_subdirectory(ppc64)
-add_subdirectory(ppc64le)
-add_subdirectory(x86_64)
-add_subdirectory(s390x)
+add_subdirectory(host)
# Make sure the parent scope can see the plugins that will be created.
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
diff --git a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
deleted file mode 100644
index 663ab4d60ff9..000000000000
--- a/openmp/libomptarget/plugins-nextgen/aarch64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for an aarch64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
-else()
- libomptarget_say("Not building aarch64 NextGen offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins-nextgen/host/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/host/CMakeLists.txt
new file mode 100644
index 000000000000..58a79898ff80
--- /dev/null
+++ b/openmp/libomptarget/plugins-nextgen/host/CMakeLists.txt
@@ -0,0 +1,109 @@
+if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux")
+ return()
+endif()
+
+ # build_generic_elf64("s390x" "S390X" "s390x" "systemz" "s390x-ibm-linux-gnu" "22")
+ # build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183")
+ # build_generic_elf64("ppc64" "PPC64" "ppc64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
+ # build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
+ # build_generic_elf64("ppc64le" "PPC64le" "ppc64" "ppc64le" "powerpc64le-ibm-linux-gnu" "21")
+set(supported_targets x86_64 aarch64 ppc64 ppc64le s390x)
+if(NOT ${CMAKE_SYSTEM_PROCESSOR} IN_LIST supported_targets)
+ libomptarget_say("Not building ${machine} NextGen offloading plugin")
+ return()
+endif()
+
+set(machine ${CMAKE_SYSTEM_PROCESSOR})
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$")
+ set(machine ppc64)
+endif()
+
+add_llvm_library(omptarget.rtl.${machine} SHARED
+ src/rtl.cpp
+ ADDITIONAL_HEADER_DIRS
+ ${LIBOMPTARGET_INCLUDE_DIR}
+ LINK_LIBS PRIVATE
+ PluginCommon
+ ${OPENMP_PTHREAD_LIB}
+ NO_INSTALL_RPATH
+ BUILDTREE_ONLY
+)
+
+if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ libomptarget_say("Building ${machine} plugin linked with libffi")
+ if(FFI_STATIC_LIBRARIES)
+ target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi_static)
+ else()
+ target_link_libraries(omptarget.rtl.${machine} PRIVATE FFI::ffi)
+ endif()
+else()
+ libomptarget_say("Building ${machine} plugin for dlopened libffi")
+ target_sources(omptarget.rtl.${machine} PRIVATE dynamic_ffi/ffi.cpp)
+ target_include_directories(omptarget.rtl.${machine} PRIVATE dynamic_ffi)
+endif()
+
+if(OMPT_TARGET_DEFAULT AND LIBOMPTARGET_OMPT_SUPPORT)
+ target_link_libraries(omptarget.rtl.${machine} PRIVATE OMPT)
+endif()
+
+if(LIBOMP_HAVE_VERSION_SCRIPT_FLAG)
+ target_link_libraries(omptarget.rtl.${machine} PRIVATE
+ "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports")
+endif()
+
+# Install plugin under the lib destination folder.
+install(TARGETS omptarget.rtl.${machine}
+ LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+set_target_properties(omptarget.rtl.${machine} PROPERTIES
+ INSTALL_RPATH "$ORIGIN" BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/.."
+ POSITION_INDEPENDENT_CODE ON
+ CXX_VISIBILITY_PRESET protected)
+
+target_include_directories(omptarget.rtl.${machine} PRIVATE
+ ${LIBOMPTARGET_INCLUDE_DIR})
+
+if(LIBOMPTARGET_DEP_LIBFFI_FOUND)
+ list(APPEND LIBOMPTARGET_TESTED_PLUGINS omptarget.rtl.${machine})
+ set(LIBOMPTARGET_TESTED_PLUGINS
+ "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
+else()
+ libomptarget_say("Not generating ${tmachine_name} tests. LibFFI not found.")
+endif()
+
+# Define macro to be used as prefix of the runtime messages for this target.
+target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_NAME=${machine})
+# TODO: This should be automatized in Debug.h.
+target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ DEBUG_PREFIX="TARGET ${machine} RTL")
+
+# Define the target specific triples and ELF machine values.
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le$" OR
+ CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64$")
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_PPC64)
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="powerpc64-ibm-linux-gnu")
+ list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
+ "powerpc64-ibm-linux-gnu" "powerpc64-ibm-linux-gnu-LTO")
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64$")
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_X86_64)
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="x86_64-pc-linux-gnu")
+ list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
+ "x86_64-pc-linux-gnu" "x86_64-pc-linux-gnu-LTO")
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64$")
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_AARCH64)
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="aarch64-unknown-linux-gnu")
+ list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
+ "aarch64-unknown-linux-gnu" "aarch64-unknown-linux-gnu-LTO")
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x$")
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE TARGET_ELF_ID=EM_S390)
+ target_compile_definitions(omptarget.rtl.${machine} PRIVATE
+ LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE="s390x-ibm-linux-gnu")
+ list(APPEND LIBOMPTARGET_SYSTEM_TARGETS
+ "s390x-ibm-linux-gnu" "s390x-ibm-linux-gnu-LTO")
+ set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE)
+endif()
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.cpp b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp
index c79daa798581..c79daa798581 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.cpp
+++ b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.cpp
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.h b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h
index 0ae025805e1d..0ae025805e1d 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/dynamic_ffi/ffi.h
+++ b/openmp/libomptarget/plugins-nextgen/host/dynamic_ffi/ffi.h
diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/host/src/rtl.cpp
index 38fc275804fa..1ef18814a26a 100644
--- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/host/src/rtl.cpp
@@ -35,7 +35,12 @@
// The ELF ID should be defined at compile-time by the build system.
#ifndef TARGET_ELF_ID
-#define TARGET_ELF_ID ELF::EM_NONE
+#define TARGET_ELF_ID EM_NONE
+#endif
+
+// The target triple should be defined at compile-time by the build system.
+#ifndef LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE
+#define LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE ""
#endif
namespace llvm {
@@ -395,7 +400,7 @@ struct GenELF64PluginTy final : public GenericPluginTy {
Error deinitImpl() override { return Plugin::success(); }
/// Get the ELF code to recognize the compatible binary images.
- uint16_t getMagicElfBits() const override { return TARGET_ELF_ID; }
+ uint16_t getMagicElfBits() const override { return ELF::TARGET_ELF_ID; }
/// This plugin does not support exchanging data between two devices.
bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override {
@@ -406,7 +411,7 @@ struct GenELF64PluginTy final : public GenericPluginTy {
Expected<bool> isELFCompatible(StringRef) const override { return true; }
Triple::ArchType getTripleArch() const override {
- return Triple::LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE;
+ return llvm::Triple(LIBOMPTARGET_NEXTGEN_GENERIC_PLUGIN_TRIPLE).getArch();
}
};
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
deleted file mode 100644
index 77466c111ee0..000000000000
--- a/openmp/libomptarget/plugins-nextgen/ppc64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("ppc64" "PPC64" "ppc64" "ppc64" "powerpc64-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64 NextGen offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
deleted file mode 100644
index 91d21627a327..000000000000
--- a/openmp/libomptarget/plugins-nextgen/ppc64le/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a ppc64le machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("ppc64le" "PPC64le" "ppc64" "ppc64le" "powerpc64le-ibm-linux-gnu" "21")
-else()
- libomptarget_say("Not building ppc64le NextGen offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
deleted file mode 100644
index 0388a235d289..000000000000
--- a/openmp/libomptarget/plugins-nextgen/s390x/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a s390x machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("s390x" "S390X" "s390x" "systemz" "s390x-ibm-linux-gnu" "22")
-else()
- libomptarget_say("Not building s390x NextGen offloading plugin: machine not found in the system.")
-endif()
diff --git a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
deleted file mode 100644
index 27cf3e069a37..000000000000
--- a/openmp/libomptarget/plugins-nextgen/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a plugin for a x86_64 machine if available.
-#
-##===----------------------------------------------------------------------===##
-
-if(CMAKE_SYSTEM_NAME MATCHES "Linux")
- build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62")
-else()
- libomptarget_say("Not building x86_64 NextGen offloading plugin: machine not found in the system.")
-endif()
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 40cfb1f470db..0d531a3dc12a 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -97,6 +97,11 @@ libc_support_library(
deps = [":llvm_libc_macros_float_macros"],
)
+libc_support_library(
+ name = "llvm_libc_macros_fcntl_macros",
+ hdrs = ["include/llvm-libc-macros/linux/fcntl-macros.h"],
+)
+
############################### Support libraries ##############################
libc_support_library(
@@ -339,6 +344,7 @@ libc_support_library(
"src/__support/CPP/type_traits/is_base_of.h",
"src/__support/CPP/type_traits/is_class.h",
"src/__support/CPP/type_traits/is_const.h",
+ "src/__support/CPP/type_traits/is_constant_evaluated.h",
"src/__support/CPP/type_traits/is_convertible.h",
"src/__support/CPP/type_traits/is_destructible.h",
"src/__support/CPP/type_traits/is_enum.h",
@@ -743,12 +749,12 @@ libc_support_library(
deps = [
":__support_common",
":__support_cpp_bit",
- ":__support_sign",
":__support_cpp_type_traits",
":__support_libc_assert",
":__support_macros_attributes",
":__support_macros_properties_types",
":__support_math_extras",
+ ":__support_sign",
":__support_uint128",
],
)
@@ -3262,6 +3268,18 @@ libc_function(
],
)
+libc_function(
+ name = "rename",
+ srcs = ["src/stdio/linux/rename.cpp"],
+ hdrs = ["src/stdio/rename.h"],
+ deps = [
+ ":__support_common",
+ ":__support_osutil_syscall",
+ ":errno",
+ ":llvm_libc_macros_fcntl_macros",
+ ],
+)
+
############################### sys/stat targets ###############################
libc_function(
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index 4802daa66286..07c5a00c07d7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2585,6 +2585,7 @@ cc_library(
hdrs = glob([
"include/llvm/Passes/*.h",
"include/llvm/Passes/*.def",
+ "include/llvm/Passes/*.inc",
]) + ["include/llvm-c/Transforms/PassBuilder.h"],
copts = llvm_copts,
deps = [
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index ba3f60380d34..3b575d4a413c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -330,16 +330,16 @@ cc_library(
"lib/Bytecode/*.h",
]) + [
"include/mlir/IR/PDLPatternMatch.h.inc",
+ "include/mlir/Interfaces/CallInterfaces.h",
+ "include/mlir/Interfaces/DataLayoutInterfaces.h",
+ "include/mlir/Interfaces/SideEffectInterfaces.h",
"lib/Bytecode/BytecodeOpInterface.cpp",
],
hdrs = glob([
"include/mlir/IR/*.h",
"include/mlir/Bytecode/*.h",
]) + [
- "include/mlir/Interfaces/CallInterfaces.h",
- "include/mlir/Interfaces/DataLayoutInterfaces.h",
"include/mlir/Interfaces/FoldInterfaces.h",
- "include/mlir/Interfaces/SideEffectInterfaces.h",
],
includes = ["include"],
deps = [
@@ -760,6 +760,7 @@ mlir_c_api_cc_library(
includes = ["include"],
deps = [
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformDialectTransforms",
],
)
@@ -1464,6 +1465,7 @@ cc_library(
":FuncDialect",
":IR",
":TransformDialect",
+ ":TransformDialectInterfaces",
":Transforms",
":VectorDialect",
],
@@ -1572,6 +1574,7 @@ cc_library(
":FuncDialect",
":IR",
":TransformDialect",
+ ":TransformDialectInterfaces",
":VectorDialect",
],
)
@@ -1643,6 +1646,7 @@ cc_library(
":IR",
":MemRefDialect",
":Pass",
+ ":SideEffectInterfaces",
":Support",
":TransformUtils",
":Transforms",
@@ -2922,6 +2926,7 @@ cc_library(
":SCFUtils",
":SideEffectInterfaces",
":TransformDialect",
+ ":TransformDialectInterfaces",
":VectorDialect",
"//llvm:Support",
],
@@ -3127,6 +3132,7 @@ cc_library(
":DialectUtils",
":IR",
":InferTypeOpInterface",
+ ":SideEffectInterfaces",
":SparseTensorAttrDefsIncGen",
":SparseTensorEnums",
":SparseTensorInterfacesIncGen",
@@ -3153,6 +3159,7 @@ cc_library(
":SparseTensorTransformOpsIncGen",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
"//llvm:Support",
],
)
@@ -3580,6 +3587,7 @@ cc_library(
":SCFTransforms",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
":VectorDialect",
"//llvm:Support",
],
@@ -3757,6 +3765,7 @@ cc_library(
":DialectUtils",
":IR",
":ShapedOpInterfaces",
+ ":SideEffectInterfaces",
":ViewLikeInterface",
":XeGPUIncGen",
"//llvm:Core",
@@ -4011,6 +4020,7 @@ cc_library(
":AffineDialect",
":Analysis",
":ArithDialect",
+ ":CallOpInterfaces",
":DialectUtils",
":FuncDialect",
":IR",
@@ -4094,6 +4104,7 @@ cc_library(
":Pass",
":SCFDialect",
":SCFUtils",
+ ":SideEffectInterfaces",
":Support",
":TensorDialect",
":Transforms",
@@ -4175,6 +4186,7 @@ cc_library(
":MathToLLVM",
":MathToLibm",
":MathToSPIRV",
+ ":MemRefToEmitC",
":MemRefToLLVM",
":MemRefToSPIRV",
":NVGPUToNVVM",
@@ -4854,6 +4866,7 @@ cc_library(
":LLVMCommonConversion",
":LLVMDialect",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformUtils",
],
)
@@ -4995,6 +5008,7 @@ cc_library(
":LLVMDialect",
":SideEffectInterfaces",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformUtils",
":VectorDialect",
":VectorEnumsIncGen",
@@ -5785,6 +5799,7 @@ cc_library(
":SideEffectInterfaces",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformUtils",
":VectorDialect",
":VectorTransforms",
@@ -6938,6 +6953,7 @@ cc_library(
]),
includes = ["include"],
deps = [
+ ":CallOpInterfaces",
":CommonFolders",
":ControlFlowInterfaces",
":FunctionInterfaces",
@@ -7538,6 +7554,7 @@ cc_library(
":TensorTransforms",
":TensorUtils",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformUtils",
],
)
@@ -7581,6 +7598,7 @@ cc_library(
includes = ["include"],
deps = [
":Analysis",
+ ":CallOpInterfaces",
":ControlFlowInterfaces",
":FunctionInterfaces",
":IR",
@@ -7911,6 +7929,7 @@ cc_library(
includes = ["include"],
deps = [
":Analysis",
+ ":CallOpInterfaces",
":ControlFlowInterfaces",
":FunctionInterfaces",
":IR",
@@ -7920,6 +7939,7 @@ cc_library(
":Pass",
":Rewrite",
":RuntimeVerifiableOpInterface",
+ ":SideEffectInterfaces",
":Support",
":TransformUtils",
":TransformsPassIncGen",
@@ -8071,6 +8091,7 @@ cc_library(
hdrs = glob(["include/mlir/Conversion/LLVMCommon/*.h"]),
includes = ["include"],
deps = [
+ ":DataLayoutInterfaces",
":IR",
":LLVMDialect",
":Support",
@@ -8237,6 +8258,32 @@ cc_library(
)
cc_library(
+ name = "MemRefToEmitC",
+ srcs = glob([
+ "lib/Conversion/MemRefToEmitC/*.cpp",
+ "lib/Conversion/MemRefToEmitC/*.h",
+ ]),
+ hdrs = glob([
+ "include/mlir/Conversion/MemRefToEmitC/*.h",
+ ]),
+ includes = [
+ "include",
+ "lib/Conversion/MemRefToEmitC",
+ ],
+ deps = [
+ ":ConversionPassIncGen",
+ ":EmitCDialect",
+ ":MemRefDialect",
+ ":IR",
+ ":Pass",
+ ":Support",
+ ":TransformUtils",
+ ":Transforms",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
name = "MemRefToLLVM",
srcs = glob(["lib/Conversion/MemRefToLLVM/*.cpp"]),
hdrs = glob(["include/mlir/Conversion/MemRefToLLVM/*.h"]),
@@ -9042,6 +9089,7 @@ cc_library(
includes = ["include"],
deps = [
":DLTIDialect",
+ ":DataLayoutInterfaces",
":IR",
":LLVMConversionIncGen",
":LLVMDialect",
@@ -11070,6 +11118,7 @@ cc_library(
":TensorUtils",
":TilingInterface",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformDialectUtils",
":TransformUtils",
":VectorDialect",
@@ -11913,6 +11962,15 @@ cc_library(
)
td_library(
+ name = "TransformInterfacesTdFiles",
+ srcs = glob(["include/mlir/Dialect/Transform/Interfaces/*.td"]),
+ deps = [
+ ":OpBaseTdFiles",
+ ":SideEffectInterfacesTdFiles",
+ ],
+)
+
+td_library(
name = "TransformDialectTdFiles",
srcs = glob(["include/mlir/Dialect/Transform/IR/*.td"]),
deps = [
@@ -11921,6 +11979,7 @@ td_library(
":InferTypeOpInterfaceTdFiles",
":OpBaseTdFiles",
":SideEffectInterfacesTdFiles",
+ ":TransformInterfacesTdFiles",
],
)
@@ -11976,29 +12035,29 @@ gentbl_cc_library(
[
"-gen-op-interface-decls",
],
- "include/mlir/Dialect/Transform/IR/TransformInterfaces.h.inc",
+ "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.h.inc",
),
(
[
"-gen-op-interface-defs",
],
- "include/mlir/Dialect/Transform/IR/TransformInterfaces.cpp.inc",
+ "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.cpp.inc",
),
(
[
"-gen-type-interface-decls",
],
- "include/mlir/Dialect/Transform/IR/TransformTypeInterfaces.h.inc",
+ "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.h.inc",
),
(
[
"-gen-type-interface-defs",
],
- "include/mlir/Dialect/Transform/IR/TransformTypeInterfaces.cpp.inc",
+ "include/mlir/Dialect/Transform/Interfaces/TransformTypeInterfaces.cpp.inc",
),
],
tblgen = ":mlir-tblgen",
- td_file = "include/mlir/Dialect/Transform/IR/TransformInterfaces.td",
+ td_file = "include/mlir/Dialect/Transform/Interfaces/TransformInterfaces.td",
deps = [":TransformDialectTdFiles"],
)
@@ -12063,19 +12122,16 @@ gentbl_cc_library(
cc_library(
name = "TransformDialectInterfaces",
- # FIXME: Change this once https://github.com/llvm/llvm-project/pull/85221 lands
- hdrs = [
- "include/mlir/Dialect/Transform/IR/TransformInterfaces.h",
- "include/mlir/Dialect/Transform/IR/TransformTypes.h",
- ],
+ srcs = glob(["lib/Dialect/Transform/Interfaces/*.cpp"]),
+ hdrs = glob(["include/mlir/Dialect/Transform/Interfaces/*.h"]),
deps = [
":CastInterfaces",
":IR",
":Rewrite",
+ ":SideEffectInterfaces",
":Support",
":TransformDialectInterfacesIncGen",
":TransformDialectUtils",
- ":TransformTypesIncGen",
":Transforms",
"//llvm:Support",
],
@@ -12102,7 +12158,7 @@ cc_library(
":Support",
":TransformDialectEnumsIncGen",
":TransformDialectIncGen",
- ":TransformDialectInterfacesIncGen",
+ ":TransformDialectInterfaces",
":TransformDialectMatchInterfacesIncGen",
":TransformDialectUtils",
":TransformOpsIncGen",
@@ -12154,6 +12210,7 @@ cc_library(
":SideEffectInterfaces",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformPDLExtensionOpsIncGen",
"//llvm:Support",
],
@@ -12197,6 +12254,7 @@ cc_library(
":Support",
":TransformDebugExtensionOpsIncGen",
":TransformDialect",
+ ":TransformDialectInterfaces",
"//llvm:Support",
],
)
@@ -12241,6 +12299,7 @@ cc_library(
":SideEffectInterfaces",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformLoopExtensionOpsIncGen",
":Transforms",
"//llvm:Support",
@@ -12285,6 +12344,7 @@ cc_library(
":SideEffectInterfaces",
":Support",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformDialectTransformsIncGen",
"//llvm:Support",
],
@@ -12659,6 +12719,7 @@ cc_library(
":ArithOpsIncGen",
":ArithOpsInterfacesIncGen",
":BufferizationInterfaces",
+ ":CallOpInterfaces",
":CastInterfaces",
":CommonFolders",
":ControlFlowInterfaces",
@@ -12670,6 +12731,7 @@ cc_library(
":InferTypeOpInterface",
":InliningUtils",
":Pass",
+ ":SideEffectInterfaces",
":Support",
":UBDialect",
":ValueBoundsOpInterfaceIncGen",
@@ -12976,6 +13038,7 @@ cc_library(
":ArithDialect",
":ArithUtils",
":BufferizationInterfaces",
+ ":CallOpInterfaces",
":CastInterfaces",
":ComplexDialect",
":ControlFlowInterfaces",
@@ -12990,6 +13053,7 @@ cc_library(
":MemorySlotInterfaces",
":RuntimeVerifiableOpInterface",
":ShapedOpInterfaces",
+ ":SideEffectInterfaces",
":Support",
":ValueBoundsOpInterface",
":ViewLikeInterface",
@@ -13124,6 +13188,7 @@ cc_library(
":NVGPUDialect",
":SCFDialect",
":TransformDialect",
+ ":TransformDialectInterfaces",
":TransformUtils",
":VectorDialect",
":VectorTransforms",
@@ -13236,6 +13301,7 @@ cc_library(
hdrs = glob(["include/mlir/Dialect/MLProgram/IR/*.h"]),
includes = ["include"],
deps = [
+ ":CallOpInterfaces",
":ControlFlowInterfaces",
":FunctionInterfaces",
":IR",
@@ -13244,6 +13310,7 @@ cc_library(
":MLProgramOpsIncGen",
":MLProgramTypesIncGen",
":Pass",
+ ":SideEffectInterfaces",
":Support",
":Transforms",
"//llvm:Support",
@@ -13556,6 +13623,7 @@ cc_library(
":SideEffectInterfaces",
":TensorDialect",
":TransformDialect",
+ ":TransformDialectInterfaces",
"//llvm:Support",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel
index ce48e249489d..2c49d52f1ed0 100644
--- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch5/BUILD.bazel
@@ -102,6 +102,7 @@ cc_binary(
"//mlir:Analysis",
"//mlir:ArithDialect",
"//mlir:BytecodeReader",
+ "//mlir:CallOpInterfaces",
"//mlir:CastInterfaces",
"//mlir:FuncDialect",
"//mlir:FuncExtensions",
diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
index 286c08065645..cd7f7f018166 100644
--- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
@@ -108,6 +108,7 @@ cc_binary(
"//mlir:ArithToLLVM",
"//mlir:BuiltinToLLVMIRTranslation",
"//mlir:BytecodeReader",
+ "//mlir:CallOpInterfaces",
"//mlir:CastInterfaces",
"//mlir:ControlFlowToLLVM",
"//mlir:ExecutionEngine",
diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
index f4037cab03f6..c03672eb4136 100644
--- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
@@ -108,6 +108,7 @@ cc_binary(
"//mlir:ArithToLLVM",
"//mlir:BuiltinToLLVMIRTranslation",
"//mlir:BytecodeReader",
+ "//mlir:CallOpInterfaces",
"//mlir:CastInterfaces",
"//mlir:ControlFlowToLLVM",
"//mlir:ExecutionEngine",
diff --git a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
index 0c3ed22e7360..d6b0832f4c1a 100644
--- a/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/python/BUILD.bazel
@@ -493,6 +493,44 @@ filegroup(
)
##---------------------------------------------------------------------------##
+# Index dialect.
+##---------------------------------------------------------------------------##
+
+gentbl_filegroup(
+ name = "IndexOpsPyGen",
+ tbl_outs = [
+ (
+ [
+ "-gen-python-enum-bindings",
+ "-bind-dialect=index",
+ ],
+ "mlir/dialects/_index_enum_gen.py",
+ ),
+ (
+ [
+ "-gen-python-op-bindings",
+ "-bind-dialect=index",
+ ],
+ "mlir/dialects/_index_ops_gen.py",
+ ),
+ ],
+ tblgen = "//mlir:mlir-tblgen",
+ td_file = "mlir/dialects/IndexOps.td",
+ deps = [
+ "//mlir:IndexOpsTdFiles",
+ "//mlir:OpBaseTdFiles",
+ ],
+)
+
+filegroup(
+ name = "IndexOpsPyFiles",
+ srcs = [
+ "mlir/dialects/index.py",
+ ":IndexOpsPyGen",
+ ],
+)
+
+##---------------------------------------------------------------------------##
# Math dialect.
##---------------------------------------------------------------------------##
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index ccfef3f24340..df2392fd8c6e 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -94,12 +94,14 @@ cc_library(
"//mlir:AffineAnalysis",
"//mlir:AffineDialect",
"//mlir:Analysis",
+ "//mlir:CallOpInterfaces",
"//mlir:ControlFlowInterfaces",
"//mlir:FuncDialect",
"//mlir:FunctionInterfaces",
"//mlir:IR",
"//mlir:MemRefDialect",
"//mlir:Pass",
+ "//mlir:SideEffectInterfaces",
"//mlir:Support",
],
)
@@ -361,6 +363,7 @@ cc_library(
"//mlir:Pass",
"//mlir:TransformDebugExtension",
"//mlir:TransformDialect",
+ "//mlir:TransformDialectInterfaces",
"//mlir:TransformDialectTransforms",
"//mlir:TransformPDLExtension",
],
@@ -385,6 +388,7 @@ cc_library(
":TestTypeDefsIncGen",
"//llvm:Support",
"//mlir:ArithDialect",
+ "//mlir:CallOpInterfaces",
"//mlir:ControlFlowInterfaces",
"//mlir:CopyOpInterface",
"//mlir:DLTIDialect",
@@ -505,6 +509,7 @@ cc_library(
"//mlir:TensorTilingInterfaceImpl",
"//mlir:TilingInterface",
"//mlir:TransformDialect",
+ "//mlir:TransformDialectInterfaces",
"//mlir:Transforms",
],
)
@@ -569,6 +574,7 @@ cc_library(
"//mlir:Pass",
"//mlir:SCFDialect",
"//mlir:SPIRVDialect",
+ "//mlir:SideEffectInterfaces",
"//mlir:Support",
"//mlir:Transforms",
],
@@ -1008,6 +1014,7 @@ cc_library(
"//mlir:TensorDialect",
"//mlir:TensorTransforms",
"//mlir:TransformDialect",
+ "//mlir:TransformDialectInterfaces",
"//mlir:Transforms",
],
)
diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
index 252b9ec951f6..c6b630230bb3 100644
--- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
@@ -40,6 +40,7 @@ cc_test(
deps = [
"//llvm:Support",
"//mlir:BytecodeReader",
+ "//mlir:CallOpInterfaces",
"//mlir:FunctionInterfaces",
"//mlir:IR",
"//mlir:Parser",